Index: projects/clang391-import/contrib/llvm/include/llvm/Support/Threading.h
===================================================================
--- projects/clang391-import/contrib/llvm/include/llvm/Support/Threading.h	(revision 309436)
+++ projects/clang391-import/contrib/llvm/include/llvm/Support/Threading.h	(revision 309437)
@@ -1,120 +1,120 @@
 //===-- llvm/Support/Threading.h - Control multithreading mode --*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file declares helper functions for running LLVM in a multi-threaded
 // environment.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_SUPPORT_THREADING_H
 #define LLVM_SUPPORT_THREADING_H
 
 #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
 #include "llvm/Support/Compiler.h"
 #include <ciso646> // So we can check the C++ standard lib macros.
 #include <functional>
 
-// We use std::call_once on all Unix platforms except for NetBSD with
-// libstdc++. That platform has a bug they are working to fix, and they'll
-// remove the NetBSD checks once fixed.
-#if defined(LLVM_ON_UNIX) &&                                                   \
-    !(defined(__NetBSD__) && !defined(_LIBCPP_VERSION)) && !defined(__ppc__)
+// std::call_once from libc++ is used on all Unix platforms. Other
+// implementations like libstdc++ are known to have problems on NetBSD,
+// OpenBSD and PowerPC.
+#if defined(LLVM_ON_UNIX) && (defined(_LIBCPP_VERSION) ||                      \
+    !(defined(__NetBSD__) || defined(__OpenBSD__) || defined(__ppc__)))
 #define LLVM_THREADING_USE_STD_CALL_ONCE 1
 #else
 #define LLVM_THREADING_USE_STD_CALL_ONCE 0
 #endif
 
 #if LLVM_THREADING_USE_STD_CALL_ONCE
 #include <mutex>
 #else
 #include "llvm/Support/Atomic.h"
 #endif
 
 namespace llvm {
   /// Returns true if LLVM is compiled with support for multi-threading, and
   /// false otherwise.
   bool llvm_is_multithreaded();
 
   /// llvm_execute_on_thread - Execute the given \p UserFn on a separate
   /// thread, passing it the provided \p UserData and waits for thread
   /// completion.
   ///
   /// This function does not guarantee that the code will actually be executed
   /// on a separate thread or honoring the requested stack size, but tries to do
   /// so where system support is available.
   ///
   /// \param UserFn - The callback to execute.
   /// \param UserData - An argument to pass to the callback function.
   /// \param RequestedStackSize - If non-zero, a requested size (in bytes) for
   /// the thread stack.
   void llvm_execute_on_thread(void (*UserFn)(void*), void *UserData,
                               unsigned RequestedStackSize = 0);
 
 #if LLVM_THREADING_USE_STD_CALL_ONCE
 
   typedef std::once_flag once_flag;
 
   /// This macro is the only way you should define your once flag for LLVM's
   /// call_once.
 #define LLVM_DEFINE_ONCE_FLAG(flag) static once_flag flag
 
 #else
 
   enum InitStatus { Uninitialized = 0, Wait = 1, Done = 2 };
   typedef volatile sys::cas_flag once_flag;
 
   /// This macro is the only way you should define your once flag for LLVM's
   /// call_once.
 #define LLVM_DEFINE_ONCE_FLAG(flag) static once_flag flag = Uninitialized
 
 #endif
 
   /// \brief Execute the function specified as a parameter once.
   ///
   /// Typical usage:
   /// \code
   ///   void foo() {...};
   ///   ...
   ///   LLVM_DEFINE_ONCE_FLAG(flag);
   ///   call_once(flag, foo);
   /// \endcode
   ///
   /// \param flag Flag used for tracking whether or not this has run.
   /// \param F Function to call once.
   template <typename Function, typename... Args>
   void call_once(once_flag &flag, Function &&F, Args &&... ArgList) {
 #if LLVM_THREADING_USE_STD_CALL_ONCE
     std::call_once(flag, std::forward<Function>(F),
                    std::forward<Args>(ArgList)...);
 #else
     // For other platforms we use a generic (if brittle) version based on our
     // atomics.
     sys::cas_flag old_val = sys::CompareAndSwap(&flag, Wait, Uninitialized);
     if (old_val == Uninitialized) {
       std::forward<Function>(F)(std::forward<Args>(ArgList)...);
       sys::MemoryFence();
       TsanIgnoreWritesBegin();
       TsanHappensBefore(&flag);
       flag = Done;
       TsanIgnoreWritesEnd();
     } else {
       // Wait until any thread doing the call has finished.
       sys::cas_flag tmp = flag;
       sys::MemoryFence();
       while (tmp != Done) {
         tmp = flag;
         sys::MemoryFence();
       }
     }
     TsanHappensAfter(&flag);
 #endif
   }
 }
 
 #endif
Index: projects/clang391-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- projects/clang391-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp	(revision 309436)
+++ projects/clang391-import/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp	(revision 309437)
@@ -1,3391 +1,3396 @@
 //===----- LegalizeIntegerTypes.cpp - Legalization of integer types -------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements integer type expansion and promotion for LegalizeTypes.
 // Promotion is the act of changing a computation in an illegal type into a
 // computation in a larger type.  For example, implementing i8 arithmetic in an
 // i32 register (often needed on powerpc).
 // Expansion is the act of changing a computation in an illegal type into a
 // computation in two identical registers of a smaller type.  For example,
 // implementing i64 arithmetic in two i32 registers (often needed on 32-bit
 // targets).
 //
 //===----------------------------------------------------------------------===//
 
 #include "LegalizeTypes.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "legalize-types"
 
 //===----------------------------------------------------------------------===//
 //  Integer Result Promotion
 //===----------------------------------------------------------------------===//
 
 /// PromoteIntegerResult - This method is called when a result of a node is
 /// found to be in need of promotion to a larger type.  At this point, the node
 /// may also have invalid operands or may have other results that need
 /// expansion, we just know that (at least) one result needs promotion.
 void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   DEBUG(dbgs() << "Promote integer result: "; N->dump(&DAG); dbgs() << "\n");
   SDValue Res = SDValue();
 
   // See if the target wants to custom expand this node.
   if (CustomLowerNode(N, N->getValueType(ResNo), true))
     return;
 
   switch (N->getOpcode()) {
   default:
 #ifndef NDEBUG
     dbgs() << "PromoteIntegerResult #" << ResNo << ": ";
     N->dump(&DAG); dbgs() << "\n";
 #endif
     llvm_unreachable("Do not know how to promote this operator!");
   case ISD::MERGE_VALUES:Res = PromoteIntRes_MERGE_VALUES(N, ResNo); break;
   case ISD::AssertSext:  Res = PromoteIntRes_AssertSext(N); break;
   case ISD::AssertZext:  Res = PromoteIntRes_AssertZext(N); break;
   case ISD::BITCAST:     Res = PromoteIntRes_BITCAST(N); break;
   case ISD::BITREVERSE:  Res = PromoteIntRes_BITREVERSE(N); break;
   case ISD::BSWAP:       Res = PromoteIntRes_BSWAP(N); break;
   case ISD::BUILD_PAIR:  Res = PromoteIntRes_BUILD_PAIR(N); break;
   case ISD::Constant:    Res = PromoteIntRes_Constant(N); break;
   case ISD::CONVERT_RNDSAT:
                          Res = PromoteIntRes_CONVERT_RNDSAT(N); break;
   case ISD::CTLZ_ZERO_UNDEF:
   case ISD::CTLZ:        Res = PromoteIntRes_CTLZ(N); break;
   case ISD::CTPOP:       Res = PromoteIntRes_CTPOP(N); break;
   case ISD::CTTZ_ZERO_UNDEF:
   case ISD::CTTZ:        Res = PromoteIntRes_CTTZ(N); break;
   case ISD::EXTRACT_VECTOR_ELT:
                          Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
   case ISD::LOAD:        Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break;
   case ISD::MLOAD:       Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));
     break;
   case ISD::MGATHER:     Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
     break;
   case ISD::SELECT:      Res = PromoteIntRes_SELECT(N); break;
   case ISD::VSELECT:     Res = PromoteIntRes_VSELECT(N); break;
   case ISD::SELECT_CC:   Res = PromoteIntRes_SELECT_CC(N); break;
   case ISD::SETCC:       Res = PromoteIntRes_SETCC(N); break;
   case ISD::SMIN:
   case ISD::SMAX:        Res = PromoteIntRes_SExtIntBinOp(N); break;
   case ISD::UMIN:
   case ISD::UMAX:        Res = PromoteIntRes_ZExtIntBinOp(N); break;
   case ISD::SHL:         Res = PromoteIntRes_SHL(N); break;
   case ISD::SIGN_EXTEND_INREG:
                          Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break;
   case ISD::SRA:         Res = PromoteIntRes_SRA(N); break;
   case ISD::SRL:         Res = PromoteIntRes_SRL(N); break;
   case ISD::TRUNCATE:    Res = PromoteIntRes_TRUNCATE(N); break;
   case ISD::UNDEF:       Res = PromoteIntRes_UNDEF(N); break;
   case ISD::VAARG:       Res = PromoteIntRes_VAARG(N); break;
 
   case ISD::EXTRACT_SUBVECTOR:
                          Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
   case ISD::VECTOR_SHUFFLE:
                          Res = PromoteIntRes_VECTOR_SHUFFLE(N); break;
   case ISD::INSERT_VECTOR_ELT:
                          Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break;
   case ISD::BUILD_VECTOR:
                          Res = PromoteIntRes_BUILD_VECTOR(N); break;
   case ISD::SCALAR_TO_VECTOR:
                          Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break;
   case ISD::CONCAT_VECTORS:
                          Res = PromoteIntRes_CONCAT_VECTORS(N); break;
 
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND:  Res = PromoteIntRes_INT_EXTEND(N); break;
 
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:  Res = PromoteIntRes_FP_TO_XINT(N); break;
 
   case ISD::FP_TO_FP16:  Res = PromoteIntRes_FP_TO_FP16(N); break;
 
   case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:         Res = PromoteIntRes_SimpleIntBinOp(N); break;
 
   case ISD::SDIV:
   case ISD::SREM:        Res = PromoteIntRes_SExtIntBinOp(N); break;
 
   case ISD::UDIV:
   case ISD::UREM:        Res = PromoteIntRes_ZExtIntBinOp(N); break;
 
   case ISD::SADDO:
   case ISD::SSUBO:       Res = PromoteIntRes_SADDSUBO(N, ResNo); break;
   case ISD::UADDO:
   case ISD::USUBO:       Res = PromoteIntRes_UADDSUBO(N, ResNo); break;
   case ISD::SMULO:
   case ISD::UMULO:       Res = PromoteIntRes_XMULO(N, ResNo); break;
 
   case ISD::ATOMIC_LOAD:
     Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
 
   case ISD::ATOMIC_LOAD_ADD:
   case ISD::ATOMIC_LOAD_SUB:
   case ISD::ATOMIC_LOAD_AND:
   case ISD::ATOMIC_LOAD_OR:
   case ISD::ATOMIC_LOAD_XOR:
   case ISD::ATOMIC_LOAD_NAND:
case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_SWAP: Res = PromoteIntRes_Atomic1(cast(N)); break; case ISD::ATOMIC_CMP_SWAP: case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: Res = PromoteIntRes_AtomicCmpSwap(cast(N), ResNo); break; } // If the result is null then the sub-method took care of registering it. if (Res.getNode()) SetPromotedInteger(SDValue(N, ResNo), Res); } SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo) { SDValue Op = DisintegrateMERGE_VALUES(N, ResNo); return GetPromotedInteger(Op); } SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) { // Sign-extend the new bits, and continue the assertion. SDValue Op = SExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::AssertSext, SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_AssertZext(SDNode *N) { // Zero the new bits, and continue the assertion. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::AssertZext, SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) { EVT ResVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(), ResVT, N->getChain(), N->getBasePtr(), N->getMemOperand(), N->getOrdering(), N->getSynchScope()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) { SDValue Op2 = GetPromotedInteger(N->getOperand(2)); SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(), N->getChain(), N->getBasePtr(), Op2, N->getMemOperand(), N->getOrdering(), N->getSynchScope()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo) { if (ResNo == 1) { assert(N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); EVT SVT = getSetCCResultType(N->getOperand(2).getValueType()); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1)); // Only use the result of getSetCCResultType if it is legal, // otherwise just use the promoted result type (NVT). if (!TLI.isTypeLegal(SVT)) SVT = NVT; SDVTList VTs = DAG.getVTList(N->getValueType(0), SVT, MVT::Other); SDValue Res = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, SDLoc(N), N->getMemoryVT(), VTs, N->getChain(), N->getBasePtr(), N->getOperand(2), N->getOperand(3), N->getMemOperand(), N->getSuccessOrdering(), N->getFailureOrdering(), N->getSynchScope()); ReplaceValueWith(SDValue(N, 0), Res.getValue(0)); ReplaceValueWith(SDValue(N, 2), Res.getValue(2)); return Res.getValue(1); } SDValue Op2 = GetPromotedInteger(N->getOperand(2)); SDValue Op3 = GetPromotedInteger(N->getOperand(3)); SDVTList VTs = DAG.getVTList(Op2.getValueType(), N->getValueType(1), MVT::Other); SDValue Res = DAG.getAtomicCmpSwap( N->getOpcode(), SDLoc(N), N->getMemoryVT(), VTs, N->getChain(), N->getBasePtr(), Op2, Op3, N->getMemOperand(), N->getSuccessOrdering(), N->getFailureOrdering(), N->getSynchScope()); // Update the use to N with the newly created Res. 
for (unsigned i = 1, NumResults = N->getNumValues(); i < NumResults; ++i) ReplaceValueWith(SDValue(N, i), Res.getValue(i)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) { SDValue InOp = N->getOperand(0); EVT InVT = InOp.getValueType(); EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); SDLoc dl(N); switch (getTypeAction(InVT)) { case TargetLowering::TypeLegal: break; case TargetLowering::TypePromoteInteger: if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector() && !NInVT.isVector()) // The input promotes to the same size. Convert the promoted value. return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetPromotedInteger(InOp)); break; case TargetLowering::TypeSoftenFloat: // Promote the integer operand by hand. return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp)); case TargetLowering::TypePromoteFloat: { // Convert the promoted float by hand. SDValue PromotedOp = GetPromotedFloat(InOp); return DAG.getNode(ISD::FP_TO_FP16, dl, NOutVT, PromotedOp); break; } case TargetLowering::TypeExpandInteger: case TargetLowering::TypeExpandFloat: break; case TargetLowering::TypeScalarizeVector: // Convert the element to an integer and promote it by hand. if (!NOutVT.isVector()) return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, BitConvertToInteger(GetScalarizedVector(InOp))); break; case TargetLowering::TypeSplitVector: { // For example, i32 = BITCAST v2i16 on alpha. Convert the split // pieces of the input into integers and reassemble in the final type. SDValue Lo, Hi; GetSplitVector(N->getOperand(0), Lo, Hi); Lo = BitConvertToInteger(Lo); Hi = BitConvertToInteger(Hi); if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); InOp = DAG.getNode(ISD::ANY_EXTEND, dl, EVT::getIntegerVT(*DAG.getContext(), NOutVT.getSizeInBits()), JoinIntegers(Lo, Hi)); return DAG.getNode(ISD::BITCAST, dl, NOutVT, InOp); } case TargetLowering::TypeWidenVector: // The input is widened to the same size. Convert to the widened value. // Make sure that the outgoing value is not a vector, because this would // make us bitcast between two vectors which are legalized in different ways. if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector()) return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetWidenedVector(InOp)); } return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, CreateStackStoreLoad(InOp, OutVT)); } SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); EVT OVT = N->getValueType(0); EVT NVT = Op.getValueType(); SDLoc dl(N); unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); return DAG.getNode( ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op), DAG.getConstant(DiffBits, dl, TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); } SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); EVT OVT = N->getValueType(0); EVT NVT = Op.getValueType(); SDLoc dl(N); unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); return DAG.getNode( ISD::SRL, dl, NVT, DAG.getNode(ISD::BITREVERSE, dl, NVT, Op), DAG.getConstant(DiffBits, dl, TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); } SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) { // The pair element type may be legal, or may not promote to the same type as // the result, for example i14 = BUILD_PAIR (i7, i7). Handle all cases. 
return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)), JoinIntegers(N->getOperand(0), N->getOperand(1))); } SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) { EVT VT = N->getValueType(0); // FIXME there is no actual debug info here SDLoc dl(N); // Zero extend things like i1, sign extend everything else. It shouldn't // matter in theory which one we pick, but this tends to give better code? unsigned Opc = VT.isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue Result = DAG.getNode(Opc, dl, TLI.getTypeToTransformTo(*DAG.getContext(), VT), SDValue(N, 0)); assert(isa(Result) && "Didn't constant fold ext?"); return Result; } SDValue DAGTypeLegalizer::PromoteIntRes_CONVERT_RNDSAT(SDNode *N) { ISD::CvtCode CvtCode = cast(N)->getCvtCode(); assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU || CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU || CvtCode == ISD::CVT_SF || CvtCode == ISD::CVT_UF) && "can only promote integers"); EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.getConvertRndSat(OutVT, SDLoc(N), N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), N->getOperand(4), CvtCode); } SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) { // Zero extend to the promoted type and do the count there. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); SDLoc dl(N); EVT OVT = N->getValueType(0); EVT NVT = Op.getValueType(); Op = DAG.getNode(N->getOpcode(), dl, NVT, Op); // Subtract off the extra leading bits in the bigger type. return DAG.getNode( ISD::SUB, dl, NVT, Op, DAG.getConstant(NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT)); } SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) { // Zero extend to the promoted type and do the count there. SDValue Op = ZExtPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::CTPOP, SDLoc(N), Op.getValueType(), Op); } SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); EVT OVT = N->getValueType(0); EVT NVT = Op.getValueType(); SDLoc dl(N); if (N->getOpcode() == ISD::CTTZ) { // The count is the same in the promoted type except if the original // value was zero. This can be handled by setting the bit just off // the top of the original type. auto TopBit = APInt::getOneBitSet(NVT.getScalarSizeInBits(), OVT.getScalarSizeInBits()); Op = DAG.getNode(ISD::OR, dl, NVT, Op, DAG.getConstant(TopBit, dl, NVT)); } return DAG.getNode(N->getOpcode(), dl, NVT, Op); } SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) { SDLoc dl(N); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NVT, N->getOperand(0), N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned NewOpc = N->getOpcode(); SDLoc dl(N); // If we're promoting a UINT to a larger size and the larger FP_TO_UINT is // not Legal, check to see if we can use FP_TO_SINT instead. (If both UINT // and SINT conversions are Custom, there is no way to tell which is // preferable. We choose SINT because that's the right thing on PPC.) 
if (N->getOpcode() == ISD::FP_TO_UINT && !TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) && TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT)) NewOpc = ISD::FP_TO_SINT; SDValue Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0)); // Assert that the converted value fits in the original type. If it doesn't // (eg: because the value being converted is too big), then the result of the // original operation was undefined anyway, so the assert is still correct. return DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext : ISD::AssertSext, dl, NVT, Res, DAG.getValueType(N->getValueType(0).getScalarType())); } SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); if (getTypeAction(N->getOperand(0).getValueType()) == TargetLowering::TypePromoteInteger) { SDValue Res = GetPromotedInteger(N->getOperand(0)); assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!"); // If the result and operand types are the same after promotion, simplify // to an in-register extension. if (NVT == Res.getValueType()) { // The high bits are not guaranteed to be anything. Insert an extend. if (N->getOpcode() == ISD::SIGN_EXTEND) return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res, DAG.getValueType(N->getOperand(0).getValueType())); if (N->getOpcode() == ISD::ZERO_EXTEND) return DAG.getZeroExtendInReg(Res, dl, N->getOperand(0).getValueType().getScalarType()); assert(N->getOpcode() == ISD::ANY_EXTEND && "Unknown integer extension!"); return Res; } } // Otherwise, just extend the original operand all the way to the larger type. return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) { assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!"); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType(); SDLoc dl(N); SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(), N->getMemoryVT(), N->getMemOperand()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0()); SDLoc dl(N); SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(), N->getMask(), ExtSrc0, N->getMemoryVT(), N->getMemOperand(), ISD::SEXTLOAD); // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue ExtSrc0 = GetPromotedInteger(N->getValue()); assert(NVT == ExtSrc0.getValueType() && "Gather result type and the passThru agrument type should be the same"); SDLoc dl(N); SDValue Ops[] = {N->getChain(), ExtSrc0, N->getMask(), N->getBasePtr(), N->getIndex()}; SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other), N->getMemoryVT(), dl, Ops, N->getMemOperand()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); return Res; } /// Promote the overflow flag of an overflowing arithmetic node. SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { // Simply change the return type of the boolean result. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1)); EVT ValueVTs[] = { N->getValueType(0), NVT }; SDValue Ops[] = { N->getOperand(0), N->getOperand(1) }; SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N), DAG.getVTList(ValueVTs), Ops); // Modified the sum result - switch anything that used the old sum to use // the new one. ReplaceValueWith(SDValue(N, 0), Res); return SDValue(Res.getNode(), 1); } SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); // The operation overflowed iff the result in the larger type is not the // sign extension of its truncation to the original type. SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = SExtPromotedInteger(N->getOperand(1)); EVT OVT = N->getOperand(0).getValueType(); EVT NVT = LHS.getValueType(); SDLoc dl(N); // Do the arithmetic in the larger type. unsigned Opcode = N->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB; SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS); // Calculate the overflow flag: sign extend the arithmetic result from // the original type. SDValue Ofl = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res, DAG.getValueType(OVT)); // Overflowed if and only if this is not equal to Res. Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE); // Use the calculated overflow everywhere. ReplaceValueWith(SDValue(N, 1), Ofl); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(1)); SDValue RHS = GetPromotedInteger(N->getOperand(2)); return DAG.getSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) { SDValue Mask = N->getOperand(0); EVT OpTy = N->getOperand(1).getValueType(); // Promote all the way up to the canonical SetCC type. 
Mask = PromoteTargetBoolean(Mask, OpTy); SDValue LHS = GetPromotedInteger(N->getOperand(1)); SDValue RHS = GetPromotedInteger(N->getOperand(2)); return DAG.getNode(ISD::VSELECT, SDLoc(N), LHS.getValueType(), Mask, LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(2)); SDValue RHS = GetPromotedInteger(N->getOperand(3)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(), N->getOperand(0), N->getOperand(1), LHS, RHS, N->getOperand(4)); } SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) { EVT SVT = getSetCCResultType(N->getOperand(0).getValueType()); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); // Only use the result of getSetCCResultType if it is legal, // otherwise just use the promoted result type (NVT). if (!TLI.isTypeLegal(SVT)) SVT = NVT; SDLoc dl(N); assert(SVT.isVector() == N->getOperand(0).getValueType().isVector() && "Vector compare must return a vector result!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if (LHS.getValueType() != RHS.getValueType()) { if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger && !LHS.getValueType().isVector()) LHS = GetPromotedInteger(LHS); if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger && !RHS.getValueType().isVector()) RHS = GetPromotedInteger(RHS); } // Get the SETCC result using the canonical SETCC type. SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, LHS, RHS, N->getOperand(2)); assert(NVT.bitsLE(SVT) && "Integer type overpromoted?"); // Convert to the expected type. return DAG.getNode(ISD::TRUNCATE, dl, NVT, SetCC); } SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger) LHS = GetPromotedInteger(LHS); if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) RHS = ZExtPromotedInteger(RHS); return DAG.getNode(ISD::SHL, SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), Op.getValueType(), Op, N->getOperand(1)); } SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) { // The input may have strange things in the top bits of the registers, but // these operations don't care. They may have weird bits going out, but // that too is okay if they are integer operations. SDValue LHS = GetPromotedInteger(N->getOperand(0)); SDValue RHS = GetPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) { // Sign extend the input. SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = SExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) { // Zero extend the input. SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); // The input value must be properly sign extended. 
if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger) LHS = SExtPromotedInteger(LHS); if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) RHS = ZExtPromotedInteger(RHS); return DAG.getNode(ISD::SRA, SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); // The input value must be properly zero extended. if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger) LHS = ZExtPromotedInteger(LHS); if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger) RHS = ZExtPromotedInteger(RHS); return DAG.getNode(ISD::SRL, SDLoc(N), LHS.getValueType(), LHS, RHS); } SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Res; SDValue InOp = N->getOperand(0); SDLoc dl(N); switch (getTypeAction(InOp.getValueType())) { default: llvm_unreachable("Unknown type action!"); case TargetLowering::TypeLegal: case TargetLowering::TypeExpandInteger: Res = InOp; break; case TargetLowering::TypePromoteInteger: Res = GetPromotedInteger(InOp); break; case TargetLowering::TypeSplitVector: EVT InVT = InOp.getValueType(); assert(InVT.isVector() && "Cannot split scalar types"); unsigned NumElts = InVT.getVectorNumElements(); assert(NumElts == NVT.getVectorNumElements() && "Dst and Src must have the same number of elements"); assert(isPowerOf2_32(NumElts) && "Promoted vector type must be a power of two"); SDValue EOp1, EOp2; GetSplitVector(InOp, EOp1, EOp2); EVT HalfNVT = EVT::getVectorVT(*DAG.getContext(), NVT.getScalarType(), NumElts/2); EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1); EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2); } // Truncate to NVT instead of VT return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res); } SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); // The operation overflowed iff the result in the larger type is not the // zero extension of its truncation to the original type. SDValue LHS = ZExtPromotedInteger(N->getOperand(0)); SDValue RHS = ZExtPromotedInteger(N->getOperand(1)); EVT OVT = N->getOperand(0).getValueType(); EVT NVT = LHS.getValueType(); SDLoc dl(N); // Do the arithmetic in the larger type. unsigned Opcode = N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB; SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS); // Calculate the overflow flag: zero extend the arithmetic result from // the original type. SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT); // Overflowed if and only if this is not equal to Res. Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE); // Use the calculated overflow everywhere. ReplaceValueWith(SDValue(N, 1), Ofl); return Res; } SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) { // Promote the overflow bit trivially. if (ResNo == 1) return PromoteIntRes_Overflow(N); SDValue LHS = N->getOperand(0), RHS = N->getOperand(1); SDLoc DL(N); EVT SmallVT = LHS.getValueType(); // To determine if the result overflowed in a larger type, we extend the // input to the larger type, do the multiply (checking if it overflows), // then also check the high bits of the result to see if overflow happened // there. 
if (N->getOpcode() == ISD::SMULO) { LHS = SExtPromotedInteger(LHS); RHS = SExtPromotedInteger(RHS); } else { LHS = ZExtPromotedInteger(LHS); RHS = ZExtPromotedInteger(RHS); } SDVTList VTs = DAG.getVTList(LHS.getValueType(), N->getValueType(1)); SDValue Mul = DAG.getNode(N->getOpcode(), DL, VTs, LHS, RHS); // Overflow occurred if it occurred in the larger type, or if the high part // of the result does not zero/sign-extend the low part. Check this second // possibility first. SDValue Overflow; if (N->getOpcode() == ISD::UMULO) { // Unsigned overflow occurred if the high part is non-zero. SDValue Hi = DAG.getNode(ISD::SRL, DL, Mul.getValueType(), Mul, DAG.getIntPtrConstant(SmallVT.getSizeInBits(), DL)); Overflow = DAG.getSetCC(DL, N->getValueType(1), Hi, DAG.getConstant(0, DL, Hi.getValueType()), ISD::SETNE); } else { // Signed overflow occurred if the high part does not sign extend the low. SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Mul.getValueType(), Mul, DAG.getValueType(SmallVT)); Overflow = DAG.getSetCC(DL, N->getValueType(1), SExt, Mul, ISD::SETNE); } // The only other way for overflow to occur is if the multiplication in the // larger type itself overflowed. Overflow = DAG.getNode(ISD::OR, DL, N->getValueType(1), Overflow, SDValue(Mul.getNode(), 1)); // Use the calculated overflow everywhere. ReplaceValueWith(SDValue(N, 1), Overflow); return Mul; } SDValue DAGTypeLegalizer::PromoteIntRes_UNDEF(SDNode *N) { return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0))); } SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) { SDValue Chain = N->getOperand(0); // Get the chain. SDValue Ptr = N->getOperand(1); // Get the pointer. EVT VT = N->getValueType(0); SDLoc dl(N); MVT RegVT = TLI.getRegisterType(*DAG.getContext(), VT); unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), VT); // The argument is passed as NumRegs registers of type RegVT. SmallVector Parts(NumRegs); for (unsigned i = 0; i < NumRegs; ++i) { Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2), N->getConstantOperandVal(3)); Chain = Parts[i].getValue(1); } // Handle endianness of the load. if (DAG.getDataLayout().isBigEndian()) std::reverse(Parts.begin(), Parts.end()); // Assemble the parts in the promoted type. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Res = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[0]); for (unsigned i = 1; i < NumRegs; ++i) { SDValue Part = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[i]); // Shift it to the right position and "or" it in. Part = DAG.getNode(ISD::SHL, dl, NVT, Part, DAG.getConstant(i * RegVT.getSizeInBits(), dl, TLI.getPointerTy(DAG.getDataLayout()))); Res = DAG.getNode(ISD::OR, dl, NVT, Res, Part); } // Modified the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Chain); return Res; } //===----------------------------------------------------------------------===// // Integer Operand Promotion //===----------------------------------------------------------------------===// /// PromoteIntegerOperand - This method is called when the specified operand of /// the specified node is found to need promotion. At this point, all of the /// result types of the node are known to be legal, but other operands of the /// node may need promotion or expansion as well as the specified one. 
bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { DEBUG(dbgs() << "Promote integer operand: "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) return false; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "PromoteIntegerOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to promote this operator's operand!"); case ISD::ANY_EXTEND: Res = PromoteIntOp_ANY_EXTEND(N); break; case ISD::ATOMIC_STORE: Res = PromoteIntOp_ATOMIC_STORE(cast(N)); break; case ISD::BITCAST: Res = PromoteIntOp_BITCAST(N); break; case ISD::BR_CC: Res = PromoteIntOp_BR_CC(N, OpNo); break; case ISD::BRCOND: Res = PromoteIntOp_BRCOND(N, OpNo); break; case ISD::BUILD_PAIR: Res = PromoteIntOp_BUILD_PAIR(N); break; case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break; case ISD::CONVERT_RNDSAT: Res = PromoteIntOp_CONVERT_RNDSAT(N); break; case ISD::INSERT_VECTOR_ELT: Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break; case ISD::SCALAR_TO_VECTOR: Res = PromoteIntOp_SCALAR_TO_VECTOR(N); break; case ISD::VSELECT: case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break; case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break; case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break; case ISD::SIGN_EXTEND: Res = PromoteIntOp_SIGN_EXTEND(N); break; case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = PromoteIntOp_STORE(cast(N), OpNo); break; case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast(N), OpNo); break; case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast(N), OpNo); break; case ISD::MGATHER: Res = PromoteIntOp_MGATHER(cast(N), OpNo); break; case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast(N), OpNo); break; case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::FP16_TO_FP: case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break; case ISD::EXTRACT_SUBVECTOR: Res = PromoteIntOp_EXTRACT_SUBVECTOR(N); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Res = PromoteIntOp_Shift(N); break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } /// PromoteSetCCOperands - Promote the operands of a comparison. This code is /// shared among BR_CC, SELECT_CC, and SETCC handlers. void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS, ISD::CondCode CCCode) { // We have to insert explicit sign or zero extends. Note that we could // insert sign extends for ALL conditions, but zero extend is cheaper on // many machines (an AND instead of two shifts), so prefer it. switch (CCCode) { default: llvm_unreachable("Unknown integer comparison!"); case ISD::SETEQ: case ISD::SETNE: { SDValue OpL = GetPromotedInteger(NewLHS); SDValue OpR = GetPromotedInteger(NewRHS); // We would prefer to promote the comparison operand with sign extension, // if we find the operand is actually to truncate an AssertSext. 
With this // optimization, we can avoid inserting real truncate instruction, which // is redudant eventually. if (OpL->getOpcode() == ISD::AssertSext && cast(OpL->getOperand(1))->getVT() == NewLHS.getValueType() && OpR->getOpcode() == ISD::AssertSext && cast(OpR->getOperand(1))->getVT() == NewRHS.getValueType()) { NewLHS = OpL; NewRHS = OpR; } else { NewLHS = ZExtPromotedInteger(NewLHS); NewRHS = ZExtPromotedInteger(NewRHS); } break; } case ISD::SETUGE: case ISD::SETUGT: case ISD::SETULE: case ISD::SETULT: // ALL of these operations will work if we either sign or zero extend // the operands (including the unsigned comparisons!). Zero extend is // usually a simpler/cheaper operation, so prefer it. NewLHS = ZExtPromotedInteger(NewLHS); NewRHS = ZExtPromotedInteger(NewRHS); break; case ISD::SETGE: case ISD::SETGT: case ISD::SETLT: case ISD::SETLE: NewLHS = SExtPromotedInteger(NewLHS); NewRHS = SExtPromotedInteger(NewRHS); break; } } SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Op); } SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) { SDValue Op2 = GetPromotedInteger(N->getOperand(2)); return DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(), N->getChain(), N->getBasePtr(), Op2, N->getMemOperand(), N->getOrdering(), N->getSynchScope()); } SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) { // This should only occur in unusual situations like bitcasting to an // x86_fp80, so just turn it into a store+load return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0)); } SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) { assert(OpNo == 2 && "Don't know how to promote this operand!"); SDValue LHS = N->getOperand(2); SDValue RHS = N->getOperand(3); PromoteSetCCOperands(LHS, RHS, cast(N->getOperand(1))->get()); // The chain (Op#0), CC (#1) and basic block destination (Op#4) are always // legal types. return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), LHS, RHS, N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) { assert(OpNo == 1 && "only know how to promote condition"); // Promote all the way up to the canonical SetCC type. SDValue Cond = PromoteTargetBoolean(N->getOperand(1), MVT::Other); // The chain (Op#0) and basic block destination (Op#2) are always legal types. return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Cond, N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) { // Since the result type is legal, the operands must promote to it. EVT OVT = N->getOperand(0).getValueType(); SDValue Lo = ZExtPromotedInteger(N->getOperand(0)); SDValue Hi = GetPromotedInteger(N->getOperand(1)); assert(Lo.getValueType() == N->getValueType(0) && "Operand over promoted?"); SDLoc dl(N); Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi, DAG.getConstant(OVT.getSizeInBits(), dl, TLI.getPointerTy(DAG.getDataLayout()))); return DAG.getNode(ISD::OR, dl, N->getValueType(0), Lo, Hi); } SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) { // The vector type is legal but the element type is not. This implies // that the vector is a power-of-two in length and that the element // type does not have a strange size (eg: it is not i1). 
EVT VecVT = N->getValueType(0); unsigned NumElts = VecVT.getVectorNumElements(); assert(!((NumElts & 1) && (!TLI.isTypeLegal(VecVT))) && "Legal vector of one illegal element?"); // Promote the inserted value. The type does not need to match the // vector element type. Check that any extra bits introduced will be // truncated away. assert(N->getOperand(0).getValueType().getSizeInBits() >= N->getValueType(0).getVectorElementType().getSizeInBits() && "Type of inserted value narrower than vector element type!"); SmallVector NewOps; for (unsigned i = 0; i < NumElts; ++i) NewOps.push_back(GetPromotedInteger(N->getOperand(i))); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) { ISD::CvtCode CvtCode = cast(N)->getCvtCode(); assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU || CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU || CvtCode == ISD::CVT_FS || CvtCode == ISD::CVT_FU) && "can only promote integer arguments"); SDValue InOp = GetPromotedInteger(N->getOperand(0)); return DAG.getConvertRndSat(N->getValueType(0), SDLoc(N), InOp, N->getOperand(1), N->getOperand(2), N->getOperand(3), N->getOperand(4), CvtCode); } SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo) { if (OpNo == 1) { // Promote the inserted value. This is valid because the type does not // have to match the vector element type. // Check that any extra bits introduced will be truncated away. assert(N->getOperand(1).getValueType().getSizeInBits() >= N->getValueType(0).getVectorElementType().getSizeInBits() && "Type of inserted value narrower than vector element type!"); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), GetPromotedInteger(N->getOperand(1)), N->getOperand(2)), 0); } assert(OpNo == 2 && "Different operand and result vector types?"); // Promote the index. SDValue Idx = DAG.getZExtOrTrunc(N->getOperand(2), SDLoc(N), TLI.getVectorIdxTy(DAG.getDataLayout())); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), Idx), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) { // Integer SCALAR_TO_VECTOR operands are implicitly truncated, so just promote // the operand in place. return SDValue(DAG.UpdateNodeOperands(N, GetPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Only know how to promote the condition!"); SDValue Cond = N->getOperand(0); EVT OpTy = N->getOperand(1).getValueType(); // Promote all the way up to the canonical SetCC type. EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy; Cond = PromoteTargetBoolean(Cond, OpVT); return SDValue(DAG.UpdateNodeOperands(N, Cond, N->getOperand(1), N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Don't know how to promote this operand!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); PromoteSetCCOperands(LHS, RHS, cast(N->getOperand(4))->get()); // The CC (#4) and the possible return values (#2 and #3) have legal types. return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2), N->getOperand(3), N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Don't know how to promote this operand!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); PromoteSetCCOperands(LHS, RHS, cast(N->getOperand(2))->get()); // The CC (#2) is always legal. 
return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2)), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) { return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), ZExtPromotedInteger(N->getOperand(1))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); SDLoc dl(N); Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op); return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(), Op, DAG.getValueType(N->getOperand(0).getValueType())); } SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) { return SDValue(DAG.UpdateNodeOperands(N, SExtPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!"); SDValue Ch = N->getChain(), Ptr = N->getBasePtr(); SDLoc dl(N); SDValue Val = GetPromotedInteger(N->getValue()); // Get promoted value. // Truncate the value and store the result. return DAG.getTruncStore(Ch, dl, Val, Ptr, N->getMemoryVT(), N->getMemOperand()); } SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo) { SDValue DataOp = N->getValue(); EVT DataVT = DataOp.getValueType(); SDValue Mask = N->getMask(); SDLoc dl(N); bool TruncateStore = false; if (OpNo == 2) { // Mask comes before the data operand. If the data operand is legal, we just // promote the mask. // When the data operand has illegal type, we should legalize the data // operand first. The mask will be promoted/splitted/widened according to // the data operand type. if (TLI.isTypeLegal(DataVT)) Mask = PromoteTargetBoolean(Mask, DataVT); else { if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) return PromoteIntOp_MSTORE(N, 3); else if (getTypeAction(DataVT) == TargetLowering::TypeWidenVector) return WidenVecOp_MSTORE(N, 3); else { assert (getTypeAction(DataVT) == TargetLowering::TypeSplitVector); return SplitVecOp_MSTORE(N, 3); } } } else { // Data operand assert(OpNo == 3 && "Unexpected operand for promotion"); DataOp = GetPromotedInteger(DataOp); Mask = PromoteTargetBoolean(Mask, DataOp.getValueType()); TruncateStore = true; } return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask, N->getMemoryVT(), N->getMemOperand(), TruncateStore); } SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo) { assert(OpNo == 2 && "Only know how to promote the mask!"); EVT DataVT = N->getValueType(0); SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[OpNo] = Mask; return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo) { SmallVector NewOps(N->op_begin(), N->op_end()); if (OpNo == 2) { // The Mask EVT DataVT = N->getValueType(0); NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); } else NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo) { SmallVector NewOps(N->op_begin(), N->op_end()); if (OpNo == 2) { // The Mask EVT DataVT = N->getValue().getValueType(); NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); } else NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) 
{ SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op); } SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) { return SDValue(DAG.UpdateNodeOperands(N, ZExtPromotedInteger(N->getOperand(0))), 0); } SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) { SDLoc dl(N); SDValue Op = GetPromotedInteger(N->getOperand(0)); Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op); return DAG.getZeroExtendInReg(Op, dl, N->getOperand(0).getValueType().getScalarType()); } //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// /// ExpandIntegerResult - This method is called when the specified result of the /// specified node is found to need expansion. At this point, the node may also /// have invalid operands or may have other results that need promotion, we just /// know that (at least) one result needs expansion. void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { DEBUG(dbgs() << "Expand integer result: "; N->dump(&DAG); dbgs() << "\n"); SDValue Lo, Hi; Lo = Hi = SDValue(); // See if the target wants to custom expand this node. if (CustomLowerNode(N, N->getValueType(ResNo), true)) return; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ExpandIntegerResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to expand the result of this operator!"); case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::BITCAST: ExpandRes_BITCAST(N, Lo, Hi); break; case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break; case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break; case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break; case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break; case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break; case ISD::AssertSext: ExpandIntRes_AssertSext(N, Lo, Hi); break; case ISD::AssertZext: ExpandIntRes_AssertZext(N, Lo, Hi); break; case ISD::BITREVERSE: ExpandIntRes_BITREVERSE(N, Lo, Hi); break; case ISD::BSWAP: ExpandIntRes_BSWAP(N, Lo, Hi); break; case ISD::Constant: ExpandIntRes_Constant(N, Lo, Hi); break; case ISD::CTLZ_ZERO_UNDEF: case ISD::CTLZ: ExpandIntRes_CTLZ(N, Lo, Hi); break; case ISD::CTPOP: ExpandIntRes_CTPOP(N, Lo, Hi); break; case ISD::CTTZ_ZERO_UNDEF: case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break; case ISD::FP_TO_SINT: ExpandIntRes_FP_TO_SINT(N, Lo, Hi); break; case ISD::FP_TO_UINT: ExpandIntRes_FP_TO_UINT(N, Lo, Hi); break; case ISD::LOAD: ExpandIntRes_LOAD(cast(N), Lo, Hi); break; case ISD::MUL: ExpandIntRes_MUL(N, Lo, Hi); break; case ISD::READCYCLECOUNTER: ExpandIntRes_READCYCLECOUNTER(N, Lo, Hi); break; case ISD::SDIV: ExpandIntRes_SDIV(N, Lo, Hi); break; case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break; case ISD::SREM: ExpandIntRes_SREM(N, Lo, Hi); break; case ISD::TRUNCATE: ExpandIntRes_TRUNCATE(N, Lo, Hi); break; case ISD::UDIV: ExpandIntRes_UDIV(N, Lo, Hi); break; case ISD::UREM: ExpandIntRes_UREM(N, Lo, Hi); break; case ISD::ZERO_EXTEND: ExpandIntRes_ZERO_EXTEND(N, Lo, Hi); break; case ISD::ATOMIC_LOAD: 
ExpandIntRes_ATOMIC_LOAD(N, Lo, Hi); break; case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_SWAP: case ISD::ATOMIC_CMP_SWAP: { std::pair Tmp = ExpandAtomic(N); SplitInteger(Tmp.first, Lo, Hi); ReplaceValueWith(SDValue(N, 1), Tmp.second); break; } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { AtomicSDNode *AN = cast(N); SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::Other); SDValue Tmp = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP, SDLoc(N), AN->getMemoryVT(), VTs, N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), AN->getMemOperand(), AN->getSuccessOrdering(), AN->getFailureOrdering(), AN->getSynchScope()); // Expanding to the strong ATOMIC_CMP_SWAP node means we can determine // success simply by comparing the loaded value against the ingoing // comparison. SDValue Success = DAG.getSetCC(SDLoc(N), N->getValueType(1), Tmp, N->getOperand(2), ISD::SETEQ); SplitInteger(Tmp, Lo, Hi); ReplaceValueWith(SDValue(N, 1), Success); ReplaceValueWith(SDValue(N, 2), Tmp.getValue(1)); break; } case ISD::AND: case ISD::OR: case ISD::XOR: ExpandIntRes_Logical(N, Lo, Hi); break; case ISD::UMAX: case ISD::SMAX: case ISD::UMIN: case ISD::SMIN: ExpandIntRes_MINMAX(N, Lo, Hi); break; case ISD::ADD: case ISD::SUB: ExpandIntRes_ADDSUB(N, Lo, Hi); break; case ISD::ADDC: case ISD::SUBC: ExpandIntRes_ADDSUBC(N, Lo, Hi); break; case ISD::ADDE: case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break; case ISD::SADDO: case ISD::SSUBO: ExpandIntRes_SADDSUBO(N, Lo, Hi); break; case ISD::UADDO: case ISD::USUBO: ExpandIntRes_UADDSUBO(N, Lo, Hi); break; case ISD::UMULO: case ISD::SMULO: ExpandIntRes_XMULO(N, Lo, Hi); break; } // If Lo/Hi is null, the sub-method took care of registering results etc. if (Lo.getNode()) SetExpandedInteger(SDValue(N, ResNo), Lo, Hi); } /// Lower an atomic node to the appropriate builtin call. std::pair DAGTypeLegalizer::ExpandAtomic(SDNode *Node) { unsigned Opc = Node->getOpcode(); MVT VT = cast(Node)->getMemoryVT().getSimpleVT(); RTLIB::Libcall LC = RTLIB::getSYNC(Opc, VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected atomic op or value type!"); return ExpandChainLibCall(LC, Node, false); } /// N is a shift by a value that needs to be expanded, /// and the shift amount is a constant 'Amt'. Expand the operation. void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt, SDValue &Lo, SDValue &Hi) { SDLoc DL(N); // Expand the incoming operand to be shifted, so that we have its parts SDValue InL, InH; GetExpandedInteger(N->getOperand(0), InL, InH); // Though Amt shouldn't usually be 0, it's possible. E.g. when legalization // splitted a vector shift, like this: SHL <0, 2>. 
if (!Amt) { Lo = InL; Hi = InH; return; } EVT NVT = InL.getValueType(); unsigned VTBits = N->getValueType(0).getSizeInBits(); unsigned NVTBits = NVT.getSizeInBits(); EVT ShTy = N->getOperand(1).getValueType(); if (N->getOpcode() == ISD::SHL) { if (Amt.ugt(VTBits)) { Lo = Hi = DAG.getConstant(0, DL, NVT); } else if (Amt.ugt(NVTBits)) { Lo = DAG.getConstant(0, DL, NVT); Hi = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt - NVTBits, DL, ShTy)); } else if (Amt == NVTBits) { Lo = DAG.getConstant(0, DL, NVT); Hi = InL; } else { Lo = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt, DL, ShTy)); Hi = DAG.getNode(ISD::OR, DL, NVT, DAG.getNode(ISD::SHL, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy)), DAG.getNode(ISD::SRL, DL, NVT, InL, DAG.getConstant(-Amt + NVTBits, DL, ShTy))); } return; } if (N->getOpcode() == ISD::SRL) { if (Amt.ugt(VTBits)) { Lo = Hi = DAG.getConstant(0, DL, NVT); } else if (Amt.ugt(NVTBits)) { Lo = DAG.getNode(ISD::SRL, DL, NVT, InH, DAG.getConstant(Amt - NVTBits, DL, ShTy)); Hi = DAG.getConstant(0, DL, NVT); } else if (Amt == NVTBits) { Lo = InH; Hi = DAG.getConstant(0, DL, NVT); } else { Lo = DAG.getNode(ISD::OR, DL, NVT, DAG.getNode(ISD::SRL, DL, NVT, InL, DAG.getConstant(Amt, DL, ShTy)), DAG.getNode(ISD::SHL, DL, NVT, InH, DAG.getConstant(-Amt + NVTBits, DL, ShTy))); Hi = DAG.getNode(ISD::SRL, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy)); } return; } assert(N->getOpcode() == ISD::SRA && "Unknown shift!"); if (Amt.ugt(VTBits)) { Hi = Lo = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(NVTBits - 1, DL, ShTy)); } else if (Amt.ugt(NVTBits)) { Lo = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(Amt - NVTBits, DL, ShTy)); Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(NVTBits - 1, DL, ShTy)); } else if (Amt == NVTBits) { Lo = InH; Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(NVTBits - 1, DL, ShTy)); } else { Lo = DAG.getNode(ISD::OR, DL, NVT, DAG.getNode(ISD::SRL, DL, NVT, InL, DAG.getConstant(Amt, DL, ShTy)), DAG.getNode(ISD::SHL, DL, NVT, InH, DAG.getConstant(-Amt + NVTBits, DL, ShTy))); Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy)); } } /// ExpandShiftWithKnownAmountBit - Try to determine whether we can simplify /// this shift based on knowledge of the high bit of the shift amount. If we /// can tell this, we know that it is >= 32 or < 32, without knowing the actual /// shift amount. bool DAGTypeLegalizer:: ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Amt = N->getOperand(1); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT ShTy = Amt.getValueType(); unsigned ShBits = ShTy.getScalarType().getSizeInBits(); unsigned NVTBits = NVT.getScalarType().getSizeInBits(); assert(isPowerOf2_32(NVTBits) && "Expanded integer type size not a power of two!"); SDLoc dl(N); APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits)); APInt KnownZero, KnownOne; DAG.computeKnownBits(N->getOperand(1), KnownZero, KnownOne); // If we don't know anything about the high bits, exit. if (((KnownZero|KnownOne) & HighBitMask) == 0) return false; // Get the incoming operand to be shifted. SDValue InL, InH; GetExpandedInteger(N->getOperand(0), InL, InH); // If we know that any of the high bits of the shift amount are one, then we // can do this as a couple of simple shifts. if (KnownOne.intersects(HighBitMask)) { // Mask out the high bit, which we know is set. 
Amt = DAG.getNode(ISD::AND, dl, ShTy, Amt, DAG.getConstant(~HighBitMask, dl, ShTy)); switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift"); case ISD::SHL: Lo = DAG.getConstant(0, dl, NVT); // Low part is zero. Hi = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); // High part from Lo part. return true; case ISD::SRL: Hi = DAG.getConstant(0, dl, NVT); // Hi part is zero. Lo = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); // Lo part from Hi part. return true; case ISD::SRA: Hi = DAG.getNode(ISD::SRA, dl, NVT, InH, // Sign extend high part. DAG.getConstant(NVTBits - 1, dl, ShTy)); Lo = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); // Lo part from Hi part. return true; } } // If we know that all of the high bits of the shift amount are zero, then we // can do this as a couple of simple shifts. if ((KnownZero & HighBitMask) == HighBitMask) { // Calculate 31-x. 31 is used instead of 32 to avoid creating an undefined // shift if x is zero. We can use XOR here because x is known to be smaller // than 32. SDValue Amt2 = DAG.getNode(ISD::XOR, dl, ShTy, Amt, DAG.getConstant(NVTBits - 1, dl, ShTy)); unsigned Op1, Op2; switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift"); case ISD::SHL: Op1 = ISD::SHL; Op2 = ISD::SRL; break; case ISD::SRL: case ISD::SRA: Op1 = ISD::SRL; Op2 = ISD::SHL; break; } // When shifting right the arithmetic for Lo and Hi is swapped. if (N->getOpcode() != ISD::SHL) std::swap(InL, InH); // Use a little trick to get the bits that move from Lo to Hi. First // shift by one bit. SDValue Sh1 = DAG.getNode(Op2, dl, NVT, InL, DAG.getConstant(1, dl, ShTy)); // Then compute the remaining shift with amount-1. SDValue Sh2 = DAG.getNode(Op2, dl, NVT, Sh1, Amt2); Lo = DAG.getNode(N->getOpcode(), dl, NVT, InL, Amt); Hi = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(Op1, dl, NVT, InH, Amt),Sh2); if (N->getOpcode() != ISD::SHL) std::swap(Hi, Lo); return true; } return false; } /// ExpandShiftWithUnknownAmountBit - Fully general expansion of integer shift /// of any size. bool DAGTypeLegalizer:: ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Amt = N->getOperand(1); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); EVT ShTy = Amt.getValueType(); unsigned NVTBits = NVT.getSizeInBits(); assert(isPowerOf2_32(NVTBits) && "Expanded integer type size not a power of two!"); SDLoc dl(N); // Get the incoming operand to be shifted. SDValue InL, InH; GetExpandedInteger(N->getOperand(0), InL, InH); SDValue NVBitsNode = DAG.getConstant(NVTBits, dl, ShTy); SDValue AmtExcess = DAG.getNode(ISD::SUB, dl, ShTy, Amt, NVBitsNode); SDValue AmtLack = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt); SDValue isShort = DAG.getSetCC(dl, getSetCCResultType(ShTy), Amt, NVBitsNode, ISD::SETULT); SDValue isZero = DAG.getSetCC(dl, getSetCCResultType(ShTy), Amt, DAG.getConstant(0, dl, ShTy), ISD::SETEQ); SDValue LoS, HiS, LoL, HiL; switch (N->getOpcode()) { default: llvm_unreachable("Unknown shift"); case ISD::SHL: // Short: ShAmt < NVTBits LoS = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); HiS = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(ISD::SHL, dl, NVT, InH, Amt), DAG.getNode(ISD::SRL, dl, NVT, InL, AmtLack)); // Long: ShAmt >= NVTBits LoL = DAG.getConstant(0, dl, NVT); // Lo part is zero. HiL = DAG.getNode(ISD::SHL, dl, NVT, InL, AmtExcess); // Hi from Lo part. 
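The known-zero-high-bits path above uses Amt XOR (NVTBits - 1), which equals NVTBits - 1 - Amt when Amt < NVTBits, and shifts the crossing bits first by one and then by that remainder, so no shift by the full half width (which would be undefined) is ever produced. A minimal sketch of the trick for SHL with a variable amount known to be below 32 (standalone C++, not part of the patch):

#include <cassert>
#include <cstdint>

// Variable-amount 64-bit SHL on two 32-bit halves, valid when the shift
// amount is known to be in [0, 31]. The "shift by one, then by (31 ^ Amt)"
// trick moves the crossing bits without ever shifting by 32.
void ShiftLeftAmtBelow32(uint32_t InL, uint32_t InH, unsigned Amt,
                         uint32_t &Lo, uint32_t &Hi) {
  assert(Amt < 32 && "high bits of the amount must be known zero");
  unsigned Amt2 = Amt ^ 31;    // == 31 - Amt because Amt < 32
  uint32_t Sh1 = InL >> 1;     // move the crossing bits by one...
  uint32_t Sh2 = Sh1 >> Amt2;  // ...then by the remaining 31 - Amt
  Lo = InL << Amt;
  Hi = (InH << Amt) | Sh2;     // crossing bits land in the high half
}

Note that for Amt == 0 the two-step shift naturally yields zero crossing bits, which is exactly why the single shift by (32 - Amt) had to be avoided.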
Lo = DAG.getSelect(dl, NVT, isShort, LoS, LoL); Hi = DAG.getSelect(dl, NVT, isZero, InH, DAG.getSelect(dl, NVT, isShort, HiS, HiL)); return true; case ISD::SRL: // Short: ShAmt < NVTBits HiS = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); LoS = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(ISD::SRL, dl, NVT, InL, Amt), // FIXME: If Amt is zero, the following shift generates an undefined result // on some architectures. DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack)); // Long: ShAmt >= NVTBits HiL = DAG.getConstant(0, dl, NVT); // Hi part is zero. LoL = DAG.getNode(ISD::SRL, dl, NVT, InH, AmtExcess); // Lo from Hi part. Lo = DAG.getSelect(dl, NVT, isZero, InL, DAG.getSelect(dl, NVT, isShort, LoS, LoL)); Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL); return true; case ISD::SRA: // Short: ShAmt < NVTBits HiS = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); LoS = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(ISD::SRL, dl, NVT, InL, Amt), DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack)); // Long: ShAmt >= NVTBits HiL = DAG.getNode(ISD::SRA, dl, NVT, InH, // Sign of Hi part. DAG.getConstant(NVTBits - 1, dl, ShTy)); LoL = DAG.getNode(ISD::SRA, dl, NVT, InH, AmtExcess); // Lo from Hi part. Lo = DAG.getSelect(dl, NVT, isZero, InL, DAG.getSelect(dl, NVT, isShort, LoS, LoL)); Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL); return true; } } static std::pair getExpandedMinMaxOps(int Op) { switch (Op) { default: llvm_unreachable("invalid min/max opcode"); case ISD::SMAX: return std::make_pair(ISD::SETGT, ISD::UMAX); case ISD::UMAX: return std::make_pair(ISD::SETUGT, ISD::UMAX); case ISD::SMIN: return std::make_pair(ISD::SETLT, ISD::UMIN); case ISD::UMIN: return std::make_pair(ISD::SETULT, ISD::UMIN); } } void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc DL(N); ISD::NodeType LoOpc; ISD::CondCode CondC; std::tie(CondC, LoOpc) = getExpandedMinMaxOps(N->getOpcode()); // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); // Value types EVT NVT = LHSL.getValueType(); EVT CCT = getSetCCResultType(NVT); // Hi part is always the same op Hi = DAG.getNode(N->getOpcode(), DL, {NVT, NVT}, {LHSH, RHSH}); // We need to know whether to select Lo part that corresponds to 'winning' // Hi part or if Hi parts are equal. SDValue IsHiLeft = DAG.getSetCC(DL, CCT, LHSH, RHSH, CondC); SDValue IsHiEq = DAG.getSetCC(DL, CCT, LHSH, RHSH, ISD::SETEQ); // Lo part corresponding to the 'winning' Hi part SDValue LoCmp = DAG.getSelect(DL, NVT, IsHiLeft, LHSL, RHSL); // Recursed Lo part if Hi parts are equal, this uses unsigned version SDValue LoMinMax = DAG.getNode(LoOpc, DL, {NVT, NVT}, {LHSL, RHSL}); Lo = DAG.getSelect(DL, NVT, IsHiEq, LoMinMax, LoCmp); } void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); EVT NVT = LHSL.getValueType(); SDValue LoOps[2] = { LHSL, RHSL }; SDValue HiOps[3] = { LHSH, RHSH }; // Do not generate ADDC/ADDE or SUBC/SUBE if the target does not support // them. TODO: Teach operation legalization how to expand unsupported // ADDC/ADDE/SUBC/SUBE. The problem is that these operations generate // a carry of type MVT::Glue, but there doesn't seem to be any way to // generate a value of this type in the expanded code sequence. 
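The ExpandIntRes_MINMAX logic above picks the high halves with the original (possibly signed) operation and only consults the low halves, always unsigned, when the high halves tie. A small standalone sketch of the same selection for a signed 64-bit max built from 32-bit halves (illustrative only):

#include <algorithm>
#include <cstdint>

// Signed 64-bit max from 32-bit halves, mirroring ExpandIntRes_MINMAX:
// the high halves are compared with the signed predicate, and only when
// they are equal does the unsigned comparison of the low halves decide.
void ExpandSMax(int32_t LHSH, uint32_t LHSL, int32_t RHSH, uint32_t RHSL,
                int32_t &Hi, uint32_t &Lo) {
  Hi = std::max(LHSH, RHSH);                // Hi part is always the same op
  bool IsHiLeft = LHSH > RHSH;              // which side wins on the high half
  bool IsHiEq = LHSH == RHSH;
  uint32_t LoCmp = IsHiLeft ? LHSL : RHSL;  // Lo from the winning side
  uint32_t LoMinMax = std::max(LHSL, RHSL); // unsigned max of the low halves
  Lo = IsHiEq ? LoMinMax : LoCmp;
}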
bool hasCarry = TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ? ISD::ADDC : ISD::SUBC, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); if (hasCarry) { SDVTList VTList = DAG.getVTList(NVT, MVT::Glue); if (N->getOpcode() == ISD::ADD) { Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps); } else { Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps); } return; } bool hasOVF = TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ? ISD::UADDO : ISD::USUBO, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); if (hasOVF) { SDVTList VTList = DAG.getVTList(NVT, NVT); TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT); int RevOpc; if (N->getOpcode() == ISD::ADD) { RevOpc = ISD::SUB; Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps); Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); } else { RevOpc = ISD::ADD; Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps); Hi = DAG.getNode(ISD::SUB, dl, NVT, makeArrayRef(HiOps, 2)); } SDValue OVF = Lo.getValue(1); switch (BoolType) { case TargetLoweringBase::UndefinedBooleanContent: OVF = DAG.getNode(ISD::AND, dl, NVT, DAG.getConstant(1, dl, NVT), OVF); // Fallthrough case TargetLoweringBase::ZeroOrOneBooleanContent: Hi = DAG.getNode(N->getOpcode(), dl, NVT, Hi, OVF); break; case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: Hi = DAG.getNode(RevOpc, dl, NVT, Hi, OVF); } return; } if (N->getOpcode() == ISD::ADD) { Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps); Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], ISD::SETULT); SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); SDValue Cmp2 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[1], ISD::SETULT); SDValue Carry2 = DAG.getSelect(dl, NVT, Cmp2, DAG.getConstant(1, dl, NVT), Carry1); Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry2); } else { Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps); Hi = DAG.getNode(ISD::SUB, dl, NVT, makeArrayRef(HiOps, 2)); SDValue Cmp = DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()), LoOps[0], LoOps[1], ISD::SETULT); SDValue Borrow = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow); } } void DAGTypeLegalizer::ExpandIntRes_ADDSUBC(SDNode *N, SDValue &Lo, SDValue &Hi) { // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; SDLoc dl(N); GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue); SDValue LoOps[2] = { LHSL, RHSL }; SDValue HiOps[3] = { LHSH, RHSH }; if (N->getOpcode() == ISD::ADDC) { Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps); } else { Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps); } // Legalized the flag result - switch anything that used the old flag to // use the new one. ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); } void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N, SDValue &Lo, SDValue &Hi) { // Expand the subcomponents. 
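When neither ADDC/ADDE nor UADDO is available, the fallback in ExpandIntRes_ADDSUB above recovers the carry out of the low half with unsigned comparisons. A minimal scalar sketch of that fallback (standalone C++, not part of the patch):

#include <cstdint>

// Double-word addition without a carry flag, mirroring the last path of
// ExpandIntRes_ADDSUB: the carry out of the low half is recovered by
// comparing the wrapped low sum against the addends.
void AddParts(uint32_t LHSL, uint32_t LHSH, uint32_t RHSL, uint32_t RHSH,
              uint32_t &Lo, uint32_t &Hi) {
  Lo = LHSL + RHSL;                            // may wrap around
  Hi = LHSH + RHSH;                            // carry not yet applied
  uint32_t Carry1 = (Lo < LHSL) ? 1u : 0u;     // wrapped past the first addend?
  uint32_t Carry2 = (Lo < RHSL) ? 1u : Carry1; // or past the second one?
  Hi += Carry2;                                // fold the carry into the high half
}

Subtraction is handled the same way with a borrow computed as (LHSL < RHSL) and subtracted from the high half.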
SDValue LHSL, LHSH, RHSL, RHSH; SDLoc dl(N); GetExpandedInteger(N->getOperand(0), LHSL, LHSH); GetExpandedInteger(N->getOperand(1), RHSL, RHSH); SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue); SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) }; SDValue HiOps[3] = { LHSH, RHSH }; Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps); // Legalized the flag result - switch anything that used the old flag to // use the new one. ReplaceValueWith(SDValue(N, 1), Hi.getValue(1)); } void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); SDValue Op = N->getOperand(0); if (Op.getValueType().bitsLE(NVT)) { // The low part is any extension of the input (which degenerates to a copy). Lo = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Op); Hi = DAG.getUNDEF(NVT); // The high part is undefined. } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. assert(getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && "Operand over promoted?"); // Split the promoted operand. This will simplify when it is expanded. SplitInteger(Res, Lo, Hi); } } void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); EVT EVT = cast(N->getOperand(1))->getVT(); unsigned NVTBits = NVT.getSizeInBits(); unsigned EVTBits = EVT.getSizeInBits(); if (NVTBits < EVTBits) { Hi = DAG.getNode(ISD::AssertSext, dl, NVT, Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), EVTBits - NVTBits))); } else { Lo = DAG.getNode(ISD::AssertSext, dl, NVT, Lo, DAG.getValueType(EVT)); // The high part replicates the sign bit of Lo, make it explicit. Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, DAG.getConstant(NVTBits - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); } } void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); EVT EVT = cast(N->getOperand(1))->getVT(); unsigned NVTBits = NVT.getSizeInBits(); unsigned EVTBits = EVT.getSizeInBits(); if (NVTBits < EVTBits) { Hi = DAG.getNode(ISD::AssertZext, dl, NVT, Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), EVTBits - NVTBits))); } else { Lo = DAG.getNode(ISD::AssertZext, dl, NVT, Lo, DAG.getValueType(EVT)); // The high part must be zero, make it explicit. Hi = DAG.getConstant(0, dl, NVT); } } void DAGTypeLegalizer::ExpandIntRes_BITREVERSE(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands. Lo = DAG.getNode(ISD::BITREVERSE, dl, Lo.getValueType(), Lo); Hi = DAG.getNode(ISD::BITREVERSE, dl, Hi.getValueType(), Hi); } void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands. 
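The "note swapped operands" trick in the BITREVERSE and BSWAP expansions above amounts to exchanging the two halves and applying the half-width operation to each. A one-liner sketch for BSWAP (the __builtin_bswap32 GCC/Clang builtin stands in for the target's 32-bit BSWAP node; illustrative only):

#include <cstdint>

// Byte-swap a 64-bit value held in two 32-bit halves: swap the roles of
// the halves and byte-swap each one, as ExpandIntRes_BSWAP does.
void ExpandBSwap(uint32_t InL, uint32_t InH, uint32_t &Lo, uint32_t &Hi) {
  Lo = __builtin_bswap32(InH); // old high half becomes the new low half
  Hi = __builtin_bswap32(InL); // old low half becomes the new high half
}

BITREVERSE follows the identical pattern with a bit-reversal of each half instead of a byte swap.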
Lo = DAG.getNode(ISD::BSWAP, dl, Lo.getValueType(), Lo); Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi); } void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); unsigned NBitWidth = NVT.getSizeInBits(); auto Constant = cast(N); const APInt &Cst = Constant->getAPIntValue(); bool IsTarget = Constant->isTargetOpcode(); bool IsOpaque = Constant->isOpaque(); SDLoc dl(N); Lo = DAG.getConstant(Cst.trunc(NBitWidth), dl, NVT, IsTarget, IsOpaque); Hi = DAG.getConstant(Cst.lshr(NBitWidth).trunc(NBitWidth), dl, NVT, IsTarget, IsOpaque); } void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // ctlz (HiLo) -> Hi != 0 ? ctlz(Hi) : (ctlz(Lo)+32) GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); SDValue HiNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Hi, DAG.getConstant(0, dl, NVT), ISD::SETNE); SDValue LoLZ = DAG.getNode(N->getOpcode(), dl, NVT, Lo); SDValue HiLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Hi); Lo = DAG.getSelect(dl, NVT, HiNotZero, HiLZ, DAG.getNode(ISD::ADD, dl, NVT, LoLZ, DAG.getConstant(NVT.getSizeInBits(), dl, NVT))); Hi = DAG.getConstant(0, dl, NVT); } void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo) GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); Lo = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::CTPOP, dl, NVT, Lo), DAG.getNode(ISD::CTPOP, dl, NVT, Hi)); Hi = DAG.getConstant(0, dl, NVT); } void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); // cttz (HiLo) -> Lo != 0 ? cttz(Lo) : (cttz(Hi)+32) GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT NVT = Lo.getValueType(); SDValue LoNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, DAG.getConstant(0, dl, NVT), ISD::SETNE); SDValue LoLZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, NVT, Lo); SDValue HiLZ = DAG.getNode(N->getOpcode(), dl, NVT, Hi); Lo = DAG.getSelect(dl, NVT, LoNotZero, LoLZ, DAG.getNode(ISD::ADD, dl, NVT, HiLZ, DAG.getConstant(NVT.getSizeInBits(), dl, NVT))); Hi = DAG.getConstant(0, dl, NVT); } void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) Op = GetPromotedFloat(Op); RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, true/*irrelevant*/, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) Op = GetPromotedFloat(Op); RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, false/*irrelevant*/, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi) { if (ISD::isNormalLoad(N)) { ExpandRes_NormalLoad(N, Lo, Hi); return; } assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!"); EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Ch = 
N->getChain(); SDValue Ptr = N->getBasePtr(); ISD::LoadExtType ExtType = N->getExtensionType(); unsigned Alignment = N->getAlignment(); MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags(); AAMDNodes AAInfo = N->getAAInfo(); SDLoc dl(N); assert(NVT.isByteSized() && "Expanded type not byte sized!"); if (N->getMemoryVT().bitsLE(NVT)) { EVT MemVT = N->getMemoryVT(); Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), MemVT, Alignment, MMOFlags, AAInfo); // Remember the chain. Ch = Lo.getValue(1); if (ExtType == ISD::SEXTLOAD) { // The high part is obtained by SRA'ing all but one of the bits of the // lo part. unsigned LoSize = Lo.getValueType().getSizeInBits(); Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo, DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); } else if (ExtType == ISD::ZEXTLOAD) { // The high part is just a zero. Hi = DAG.getConstant(0, dl, NVT); } else { assert(ExtType == ISD::EXTLOAD && "Unknown extload!"); // The high part is undefined. Hi = DAG.getUNDEF(NVT); } } else if (DAG.getDataLayout().isLittleEndian()) { // Little-endian - low bits are at low addresses. Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(), Alignment, MMOFlags, AAInfo); unsigned ExcessBits = N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits(); EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits); // Increment the pointer to the other half. unsigned IncrementSize = NVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, dl, Ptr.getValueType())); Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT, MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of the // other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); } else { // Big-endian - high bits are at low addresses. Favor aligned loads at // the cost of some bit-fiddling. EVT MemVT = N->getMemoryVT(); unsigned EBytes = MemVT.getStoreSize(); unsigned IncrementSize = NVT.getSizeInBits()/8; unsigned ExcessBits = (EBytes - IncrementSize)*8; // Load both the high bits and maybe some of the low bits. Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits() - ExcessBits), Alignment, MMOFlags, AAInfo); // Increment the pointer to the other half. Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, dl, Ptr.getValueType())); // Load the rest of the low bits. Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), EVT::getIntegerVT(*DAG.getContext(), ExcessBits), MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); // Build a factor node to remember that this load is independent of the // other one. Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); if (ExcessBits < NVT.getSizeInBits()) { // Transfer low bits from the bottom of Hi to the top of Lo. Lo = DAG.getNode( ISD::OR, dl, NVT, Lo, DAG.getNode(ISD::SHL, dl, NVT, Hi, DAG.getConstant(ExcessBits, dl, TLI.getPointerTy(DAG.getDataLayout())))); // Move high bits to the right position in Hi. Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, dl, NVT, Hi, DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl, TLI.getPointerTy(DAG.getDataLayout()))); } } // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Ch); } void DAGTypeLegalizer::ExpandIntRes_Logical(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); SDValue LL, LH, RL, RH; GetExpandedInteger(N->getOperand(0), LL, LH); GetExpandedInteger(N->getOperand(1), RL, RH); Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LL, RL); Hi = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LH, RH); } void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDLoc dl(N); SDValue LL, LH, RL, RH; GetExpandedInteger(N->getOperand(0), LL, LH); GetExpandedInteger(N->getOperand(1), RL, RH); if (TLI.expandMUL(N, Lo, Hi, NVT, DAG, LL, LH, RL, RH)) return; // If nothing else, we can make a libcall. RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::MUL_I16; else if (VT == MVT::i32) LC = RTLIB::MUL_I32; else if (VT == MVT::i64) LC = RTLIB::MUL_I64; else if (VT == MVT::i128) LC = RTLIB::MUL_I128; if (LC == RTLIB::UNKNOWN_LIBCALL) { // We'll expand the multiplication by brute force because we have no other // options. This is a trivially-generalized version of the code from // Hacker's Delight (itself derived from Knuth's Algorithm M from section // 4.3.1). - SDValue Mask = - DAG.getConstant(APInt::getLowBitsSet(NVT.getSizeInBits(), - NVT.getSizeInBits() >> 1), dl, NVT); + unsigned Bits = NVT.getSizeInBits(); + unsigned HalfBits = Bits >> 1; + SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, + NVT); SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask); SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask); SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL); SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask); - SDValue Shift = - DAG.getConstant(NVT.getSizeInBits() >> 1, dl, - TLI.getShiftAmountTy(NVT, DAG.getDataLayout())); + EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); + if (APInt::getMaxValue(ShiftAmtTy.getSizeInBits()).ult(HalfBits)) { + // The type from TLI is too small to fit the shift amount we want. + // Override it with i32. The shift will have to be legalized. 
+ ShiftAmtTy = MVT::i32; + } + SDValue Shift = DAG.getConstant(HalfBits, dl, ShiftAmtTy); SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift); SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift); SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift); SDValue U = DAG.getNode(ISD::ADD, dl, NVT, - DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TL); + DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TH); SDValue UL = DAG.getNode(ISD::AND, dl, NVT, U, Mask); SDValue UH = DAG.getNode(ISD::SRL, dl, NVT, U, Shift); SDValue V = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::MUL, dl, NVT, LLL, RLH), UL); SDValue VH = DAG.getNode(ISD::SRL, dl, NVT, V, Shift); SDValue W = DAG.getNode(ISD::ADD, dl, NVT, - DAG.getNode(ISD::MUL, dl, NVT, LL, RL), + DAG.getNode(ISD::MUL, dl, NVT, LLH, RLH), DAG.getNode(ISD::ADD, dl, NVT, UH, VH)); - Lo = DAG.getNode(ISD::ADD, dl, NVT, TH, + Lo = DAG.getNode(ISD::ADD, dl, NVT, TL, DAG.getNode(ISD::SHL, dl, NVT, V, Shift)); Hi = DAG.getNode(ISD::ADD, dl, NVT, W, DAG.getNode(ISD::ADD, dl, NVT, - DAG.getNode(ISD::MUL, dl, NVT, RH, LL), + DAG.getNode(ISD::MUL, dl, NVT, RH, LL), DAG.getNode(ISD::MUL, dl, NVT, RL, LH))); return; } SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true/*irrelevant*/, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_READCYCLECOUNTER(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc DL(N); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDVTList VTs = DAG.getVTList(NVT, NVT, MVT::Other); SDValue R = DAG.getNode(N->getOpcode(), DL, VTs, N->getOperand(0)); Lo = R.getValue(0); Hi = R.getValue(1); ReplaceValueWith(SDValue(N, 1), R.getValue(2)); } void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, SDValue &Lo, SDValue &Hi) { SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); SDLoc dl(Node); // Expand the result by simply replacing it with the equivalent // non-overflow-checking operation. SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB, dl, LHS.getValueType(), LHS, RHS); SplitInteger(Sum, Lo, Hi); // Compute the overflow. // // LHSSign -> LHS >= 0 // RHSSign -> RHS >= 0 // SumSign -> Sum >= 0 // // Add: // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) // Sub: // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) // EVT OType = Node->getValueType(1); SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType()); SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE); SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE); SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign, Node->getOpcode() == ISD::SADDO ? ISD::SETEQ : ISD::SETNE); SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE); SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE); SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE); // Use the calculated overflow everywhere. 
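The hunk above corrects the brute-force double-width multiplication (Knuth's Algorithm M, as cited from Hacker's Delight): U must absorb TH rather than TL, W must use the product of the high half-words LLH*RLH rather than the full LL*RL, and the low result starts from TL rather than TH. As a standalone illustration, here is the corrected recurrence scaled down to a 32x32->64 product built from 16-bit half-words; the names mirror the patch, but the function itself is only a sketch, not part of it:

#include <cassert>
#include <cstdint>

// Form the 64-bit product of two 32-bit values from 16-bit half-words
// using only 32-bit multiplies, exactly as the legalizer forms a 2*NVT
// product from NVT-sized pieces.
void MulParts(uint32_t LL, uint32_t RL, uint32_t &Lo, uint32_t &Hi) {
  const unsigned HalfBits = 16;
  const uint32_t Mask = 0xFFFFu;                        // low half-word mask

  uint32_t LLL = LL & Mask, RLL = RL & Mask;            // low half-words
  uint32_t LLH = LL >> HalfBits, RLH = RL >> HalfBits;  // high half-words

  uint32_t T = LLL * RLL;                               // low x low
  uint32_t TL = T & Mask, TH = T >> HalfBits;

  uint32_t U = LLH * RLL + TH;                          // note: + TH, not + TL
  uint32_t UL = U & Mask, UH = U >> HalfBits;

  uint32_t V = LLL * RLH + UL;
  uint32_t VH = V >> HalfBits;

  uint32_t W = LLH * RLH + (UH + VH);                   // note: high x high, not LL*RL

  Lo = TL + (V << HalfBits);                            // note: TL, not TH
  Hi = W;

  // Sanity check against a native wide multiply.
  assert((((uint64_t)Hi << 32) | Lo) == (uint64_t)LL * (uint64_t)RL);
}

In the patched code the same Lo/Hi pair is then combined with the cross terms RH*LL + RL*LH, which the legalizer adds into the high half afterwards; the sketch covers only the full-product core that the hunk fixes.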
ReplaceValueWith(SDValue(Node, 1), Cmp); } void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; if (TLI.getOperationAction(ISD::SDIVREM, VT) == TargetLowering::Custom) { SDValue Res = DAG.getNode(ISD::SDIVREM, dl, DAG.getVTList(VT, VT), Ops); SplitInteger(Res.getValue(0), Lo, Hi); return; } RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::SDIV_I16; else if (VT == MVT::i32) LC = RTLIB::SDIV_I32; else if (VT == MVT::i64) LC = RTLIB::SDIV_I64; else if (VT == MVT::i128) LC = RTLIB::SDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); // If we can emit an efficient shift operation, do so now. Check to see if // the RHS is a constant. if (ConstantSDNode *CN = dyn_cast(N->getOperand(1))) return ExpandShiftByConstant(N, CN->getAPIntValue(), Lo, Hi); // If we can determine that the high bit of the shift is zero or one, even if // the low bits are variable, emit this shift in an optimized form. if (ExpandShiftWithKnownAmountBit(N, Lo, Hi)) return; // If this target supports shift_PARTS, use it. First, map to the _PARTS opc. unsigned PartsOpc; if (N->getOpcode() == ISD::SHL) { PartsOpc = ISD::SHL_PARTS; } else if (N->getOpcode() == ISD::SRL) { PartsOpc = ISD::SRL_PARTS; } else { assert(N->getOpcode() == ISD::SRA && "Unknown shift!"); PartsOpc = ISD::SRA_PARTS; } // Next check to see if the target supports this SHL_PARTS operation or if it // will custom expand it. EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); TargetLowering::LegalizeAction Action = TLI.getOperationAction(PartsOpc, NVT); if ((Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) || Action == TargetLowering::Custom) { // Expand the subcomponents. SDValue LHSL, LHSH; GetExpandedInteger(N->getOperand(0), LHSL, LHSH); EVT VT = LHSL.getValueType(); // If the shift amount operand is coming from a vector legalization it may // have an illegal type. Fix that first by casting the operand, otherwise // the new SHL_PARTS operation would need further legalization. SDValue ShiftOp = N->getOperand(1); EVT ShiftTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); assert(ShiftTy.getScalarType().getSizeInBits() >= Log2_32_Ceil(VT.getScalarType().getSizeInBits()) && "ShiftAmountTy is too small to cover the range of this type!"); if (ShiftOp.getValueType() != ShiftTy) ShiftOp = DAG.getZExtOrTrunc(ShiftOp, dl, ShiftTy); SDValue Ops[] = { LHSL, LHSH, ShiftOp }; Lo = DAG.getNode(PartsOpc, dl, DAG.getVTList(VT, VT), Ops); Hi = Lo.getValue(1); return; } // Otherwise, emit a libcall. 
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; bool isSigned; if (N->getOpcode() == ISD::SHL) { isSigned = false; /*sign irrelevant*/ if (VT == MVT::i16) LC = RTLIB::SHL_I16; else if (VT == MVT::i32) LC = RTLIB::SHL_I32; else if (VT == MVT::i64) LC = RTLIB::SHL_I64; else if (VT == MVT::i128) LC = RTLIB::SHL_I128; } else if (N->getOpcode() == ISD::SRL) { isSigned = false; if (VT == MVT::i16) LC = RTLIB::SRL_I16; else if (VT == MVT::i32) LC = RTLIB::SRL_I32; else if (VT == MVT::i64) LC = RTLIB::SRL_I64; else if (VT == MVT::i128) LC = RTLIB::SRL_I128; } else { assert(N->getOpcode() == ISD::SRA && "Unknown shift!"); isSigned = true; if (VT == MVT::i16) LC = RTLIB::SRA_I16; else if (VT == MVT::i32) LC = RTLIB::SRA_I32; else if (VT == MVT::i64) LC = RTLIB::SRA_I64; else if (VT == MVT::i128) LC = RTLIB::SRA_I128; } if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, isSigned, dl).first, Lo, Hi); return; } if (!ExpandShiftWithUnknownAmountBit(N, Lo, Hi)) llvm_unreachable("Unsupported shift!"); } void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); SDValue Op = N->getOperand(0); if (Op.getValueType().bitsLE(NVT)) { // The low part is sign extension of the input (degenerates to a copy). Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, N->getOperand(0)); // The high part is obtained by SRA'ing all but one of the bits of low part. unsigned LoSize = NVT.getSizeInBits(); Hi = DAG.getNode( ISD::SRA, dl, NVT, Lo, DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); } else { // For example, extension of an i48 to an i64. The operand type necessarily // promotes to the result type, so will end up being expanded too. assert(getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && "Operand over promoted?"); // Split the promoted operand. This will simplify when it is expanded. SplitInteger(Res, Lo, Hi); unsigned ExcessBits = Op.getValueType().getSizeInBits() - NVT.getSizeInBits(); Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), ExcessBits))); } } void DAGTypeLegalizer:: ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); GetExpandedInteger(N->getOperand(0), Lo, Hi); EVT EVT = cast(N->getOperand(1))->getVT(); if (EVT.bitsLE(Lo.getValueType())) { // sext_inreg the low part if needed. Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Lo.getValueType(), Lo, N->getOperand(1)); // The high part gets the sign extension from the lo-part. This handles // things like sextinreg V:i64 from i8. Hi = DAG.getNode(ISD::SRA, dl, Hi.getValueType(), Lo, DAG.getConstant(Hi.getValueType().getSizeInBits() - 1, dl, TLI.getPointerTy(DAG.getDataLayout()))); } else { // For example, extension of an i48 to an i64. Leave the low part alone, // sext_inreg the high part. 
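ExpandIntRes_SIGN_EXTEND above builds the high half by arithmetically shifting the sign-extended low half by (width - 1), i.e. a replicated sign bit. A tiny scalar sketch of that identity (i16 source expanded into i32 halves; widths are illustrative):

#include <cstdint>

void ExpandSExt(int16_t In, uint32_t &Lo, uint32_t &Hi) {
  int32_t L = In;                      // low part: sign extension (a copy)
  Lo = (uint32_t)L;
  // The legalizer forms the high part as SRA(Lo, 31); that is exactly a
  // replicated sign bit, written here without relying on the host's
  // right-shift behaviour for negative values.
  Hi = (In < 0) ? 0xFFFFFFFFu : 0x0u;
}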
unsigned ExcessBits = EVT.getSizeInBits() - Lo.getValueType().getSizeInBits(); Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi, DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), ExcessBits))); } } void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; if (TLI.getOperationAction(ISD::SDIVREM, VT) == TargetLowering::Custom) { SDValue Res = DAG.getNode(ISD::SDIVREM, dl, DAG.getVTList(VT, VT), Ops); SplitInteger(Res.getValue(1), Lo, Hi); return; } RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::SREM_I16; else if (VT == MVT::i32) LC = RTLIB::SREM_I32; else if (VT == MVT::i64) LC = RTLIB::SREM_I64; else if (VT == MVT::i128) LC = RTLIB::SREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0)); Hi = DAG.getNode(ISD::SRL, dl, N->getOperand(0).getValueType(), N->getOperand(0), DAG.getConstant(NVT.getSizeInBits(), dl, TLI.getPointerTy(DAG.getDataLayout()))); Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi); } void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDLoc dl(N); // Expand the result by simply replacing it with the equivalent // non-overflow-checking operation. SDValue Sum = DAG.getNode(N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB, dl, LHS.getValueType(), LHS, RHS); SplitInteger(Sum, Lo, Hi); // Calculate the overflow: addition overflows iff a + b < a, and subtraction // overflows iff a - b > a. SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, N->getOpcode () == ISD::UADDO ? ISD::SETULT : ISD::SETUGT); // Use the calculated overflow everywhere. ReplaceValueWith(SDValue(N, 1), Ofl); } void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); // A divide for UMULO should be faster than a function call. if (N->getOpcode() == ISD::UMULO) { SDValue LHS = N->getOperand(0), RHS = N->getOperand(1); SDValue MUL = DAG.getNode(ISD::MUL, dl, LHS.getValueType(), LHS, RHS); SplitInteger(MUL, Lo, Hi); // A divide for UMULO will be faster than a function call. Select to // make sure we aren't using 0. SDValue isZero = DAG.getSetCC(dl, getSetCCResultType(VT), RHS, DAG.getConstant(0, dl, VT), ISD::SETEQ); SDValue NotZero = DAG.getSelect(dl, VT, isZero, DAG.getConstant(1, dl, VT), RHS); SDValue DIV = DAG.getNode(ISD::UDIV, dl, VT, MUL, NotZero); SDValue Overflow = DAG.getSetCC(dl, N->getValueType(1), DIV, LHS, ISD::SETNE); Overflow = DAG.getSelect(dl, N->getValueType(1), isZero, DAG.getConstant(0, dl, N->getValueType(1)), Overflow); ReplaceValueWith(SDValue(N, 1), Overflow); return; } Type *RetTy = VT.getTypeForEVT(*DAG.getContext()); EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext()); // Replace this with a libcall that will check overflow. 
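The overflow recipes above are worth seeing in isolation: ExpandIntRes_UADDSUBO uses "a + b overflows iff a + b < a" (and "a - b overflows iff a - b > a"), and the UMULO fast path checks the truncated product by dividing it back by one non-zero operand. A standalone sketch of both checks (illustrative only, not part of the patch):

#include <cstdint>

bool UAddOverflows(uint32_t A, uint32_t B, uint32_t &Sum) {
  Sum = A + B;            // wrapping add
  return Sum < A;         // wrapped past A => carry out
}

bool USubOverflows(uint32_t A, uint32_t B, uint32_t &Diff) {
  Diff = A - B;           // wrapping subtract
  return Diff > A;        // wrapped past A => borrow
}

bool UMulOverflows(uint32_t A, uint32_t B, uint32_t &Prod) {
  Prod = A * B;           // truncated product
  if (B == 0)             // guard the divide, as the DAG code does
    return false;
  return Prod / B != A;   // recovered factor mismatch => overflow
}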
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i32) LC = RTLIB::MULO_I32; else if (VT == MVT::i64) LC = RTLIB::MULO_I64; else if (VT == MVT::i128) LC = RTLIB::MULO_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XMULO!"); SDValue Temp = DAG.CreateStackTemporary(PtrVT); // Temporary for the overflow value, default it to zero. SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, DAG.getConstant(0, dl, PtrVT), Temp, MachinePointerInfo()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (const SDValue &Op : N->op_values()) { EVT ArgVT = Op.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Node = Op; Entry.Ty = ArgTy; Entry.isSExt = true; Entry.isZExt = false; Args.push_back(Entry); } // Also pass the address of the overflow check. Entry.Node = Temp; Entry.Ty = PtrTy->getPointerTo(); Entry.isSExt = true; Entry.isZExt = false; Args.push_back(Entry); SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args)) .setSExtResult(); std::pair CallInfo = TLI.LowerCallTo(CLI); SplitInteger(CallInfo.first, Lo, Hi); SDValue Temp2 = DAG.getLoad(PtrVT, dl, CallInfo.second, Temp, MachinePointerInfo()); SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Temp2, DAG.getConstant(0, dl, PtrVT), ISD::SETNE); // Use the overflow from the libcall everywhere. ReplaceValueWith(SDValue(N, 1), Ofl); } void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; if (TLI.getOperationAction(ISD::UDIVREM, VT) == TargetLowering::Custom) { SDValue Res = DAG.getNode(ISD::UDIVREM, dl, DAG.getVTList(VT, VT), Ops); SplitInteger(Res.getValue(0), Lo, Hi); return; } RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::UDIV_I16; else if (VT == MVT::i32) LC = RTLIB::UDIV_I32; else if (VT == MVT::i64) LC = RTLIB::UDIV_I64; else if (VT == MVT::i128) LC = RTLIB::UDIV_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; if (TLI.getOperationAction(ISD::UDIVREM, VT) == TargetLowering::Custom) { SDValue Res = DAG.getNode(ISD::UDIVREM, dl, DAG.getVTList(VT, VT), Ops); SplitInteger(Res.getValue(1), Lo, Hi); return; } RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; if (VT == MVT::i16) LC = RTLIB::UREM_I16; else if (VT == MVT::i32) LC = RTLIB::UREM_I32; else if (VT == MVT::i64) LC = RTLIB::UREM_I64; else if (VT == MVT::i128) LC = RTLIB::UREM_I128; assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!"); SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); SDValue Op = N->getOperand(0); if (Op.getValueType().bitsLE(NVT)) { // The low part is zero extension of the input (degenerates to a copy). Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N->getOperand(0)); Hi = DAG.getConstant(0, dl, NVT); // The high part is just a zero. } else { // For example, extension of an i48 to an i64. 
The operand type necessarily // promotes to the result type, so will end up being expanded too. assert(getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteInteger && "Only know how to promote this result!"); SDValue Res = GetPromotedInteger(Op); assert(Res.getValueType() == N->getValueType(0) && "Operand over promoted?"); // Split the promoted operand. This will simplify when it is expanded. SplitInteger(Res, Lo, Hi); unsigned ExcessBits = Op.getValueType().getSizeInBits() - NVT.getSizeInBits(); Hi = DAG.getZeroExtendInReg(Hi, dl, EVT::getIntegerVT(*DAG.getContext(), ExcessBits)); } } void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); EVT VT = cast(N)->getMemoryVT(); SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); SDValue Zero = DAG.getConstant(0, dl, VT); SDValue Swap = DAG.getAtomicCmpSwap( ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, cast(N)->getMemoryVT(), VTs, N->getOperand(0), N->getOperand(1), Zero, Zero, cast(N)->getMemOperand(), cast(N)->getOrdering(), cast(N)->getOrdering(), cast(N)->getSynchScope()); ReplaceValueWith(SDValue(N, 0), Swap.getValue(0)); ReplaceValueWith(SDValue(N, 1), Swap.getValue(2)); } //===----------------------------------------------------------------------===// // Integer Operand Expansion //===----------------------------------------------------------------------===// /// ExpandIntegerOperand - This method is called when the specified operand of /// the specified node is found to need expansion. At this point, all of the /// result types of the node are known to be legal, but other operands of the /// node may need promotion or expansion as well as the specified one. bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) { DEBUG(dbgs() << "Expand integer operand: "; N->dump(&DAG); dbgs() << "\n"); SDValue Res = SDValue(); if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) return false; switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ExpandIntegerOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif llvm_unreachable("Do not know how to expand this operator's operand!"); case ISD::BITCAST: Res = ExpandOp_BITCAST(N); break; case ISD::BR_CC: Res = ExpandIntOp_BR_CC(N); break; case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break; case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break; case ISD::INSERT_VECTOR_ELT: Res = ExpandOp_INSERT_VECTOR_ELT(N); break; case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break; case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break; case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break; case ISD::SETCCE: Res = ExpandIntOp_SETCCE(N); break; case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break; case ISD::STORE: Res = ExpandIntOp_STORE(cast(N), OpNo); break; case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break; case ISD::UINT_TO_FP: Res = ExpandIntOp_UINT_TO_FP(N); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Res = ExpandIntOp_Shift(N); break; case ISD::RETURNADDR: case ISD::FRAMEADDR: Res = ExpandIntOp_RETURNADDR(N); break; case ISD::ATOMIC_STORE: Res = ExpandIntOp_ATOMIC_STORE(N); break; } // If the result is null, the sub-method took care of registering results etc. if (!Res.getNode()) return false; // If the result is N, the sub-method updated N in place. Tell the legalizer // core about this. 
if (Res.getNode() == N) return true; assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 && "Invalid operand expansion"); ReplaceValueWith(SDValue(N, 0), Res); return false; } /// IntegerExpandSetCCOperands - Expand the operands of a comparison. This code /// is shared among BR_CC, SELECT_CC, and SETCC handlers. void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &dl) { SDValue LHSLo, LHSHi, RHSLo, RHSHi; GetExpandedInteger(NewLHS, LHSLo, LHSHi); GetExpandedInteger(NewRHS, RHSLo, RHSHi); if (CCCode == ISD::SETEQ || CCCode == ISD::SETNE) { if (RHSLo == RHSHi) { if (ConstantSDNode *RHSCST = dyn_cast(RHSLo)) { if (RHSCST->isAllOnesValue()) { // Equality comparison to -1. NewLHS = DAG.getNode(ISD::AND, dl, LHSLo.getValueType(), LHSLo, LHSHi); NewRHS = RHSLo; return; } } } NewLHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSLo, RHSLo); NewRHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSHi, RHSHi); NewLHS = DAG.getNode(ISD::OR, dl, NewLHS.getValueType(), NewLHS, NewRHS); NewRHS = DAG.getConstant(0, dl, NewLHS.getValueType()); return; } // If this is a comparison of the sign bit, just look at the top part. // X > -1, x < 0 if (ConstantSDNode *CST = dyn_cast(NewRHS)) if ((CCCode == ISD::SETLT && CST->isNullValue()) || // X < 0 (CCCode == ISD::SETGT && CST->isAllOnesValue())) { // X > -1 NewLHS = LHSHi; NewRHS = RHSHi; return; } // FIXME: This generated code sucks. ISD::CondCode LowCC; switch (CCCode) { default: llvm_unreachable("Unknown integer setcc!"); case ISD::SETLT: case ISD::SETULT: LowCC = ISD::SETULT; break; case ISD::SETGT: case ISD::SETUGT: LowCC = ISD::SETUGT; break; case ISD::SETLE: case ISD::SETULE: LowCC = ISD::SETULE; break; case ISD::SETGE: case ISD::SETUGE: LowCC = ISD::SETUGE; break; } // Tmp1 = lo(op1) < lo(op2) // Always unsigned comparison // Tmp2 = hi(op1) < hi(op2) // Signedness depends on operands // dest = hi(op1) == hi(op2) ? Tmp1 : Tmp2; // NOTE: on targets without efficient SELECT of bools, we can always use // this identity: (B1 ? B2 : B3) --> (B1 & B2)|(!B1&B3) TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, AfterLegalizeTypes, true, nullptr); SDValue Tmp1, Tmp2; if (TLI.isTypeLegal(LHSLo.getValueType()) && TLI.isTypeLegal(RHSLo.getValueType())) Tmp1 = TLI.SimplifySetCC(getSetCCResultType(LHSLo.getValueType()), LHSLo, RHSLo, LowCC, false, DagCombineInfo, dl); if (!Tmp1.getNode()) Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSLo.getValueType()), LHSLo, RHSLo, LowCC); if (TLI.isTypeLegal(LHSHi.getValueType()) && TLI.isTypeLegal(RHSHi.getValueType())) Tmp2 = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, CCCode, false, DagCombineInfo, dl); if (!Tmp2.getNode()) Tmp2 = DAG.getNode(ISD::SETCC, dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, DAG.getCondCode(CCCode)); ConstantSDNode *Tmp1C = dyn_cast(Tmp1.getNode()); ConstantSDNode *Tmp2C = dyn_cast(Tmp2.getNode()); if ((Tmp1C && Tmp1C->isNullValue()) || (Tmp2C && Tmp2C->isNullValue() && (CCCode == ISD::SETLE || CCCode == ISD::SETGE || CCCode == ISD::SETUGE || CCCode == ISD::SETULE)) || (Tmp2C && Tmp2C->getAPIntValue() == 1 && (CCCode == ISD::SETLT || CCCode == ISD::SETGT || CCCode == ISD::SETUGT || CCCode == ISD::SETULT))) { // low part is known false, returns high part. // For LE / GE, if high part is known false, ignore the low part. // For LT / GT, if high part is known true, ignore the low part. 
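IntegerExpandSetCCOperands above reduces a wide comparison to half-width pieces: equality folds to an OR of XORs compared against zero, and an ordered compare uses the high halves (with the original signedness) unless they are equal, in which case the always-unsigned low compare decides. A scalar sketch of both reductions (standalone C++, not part of the patch):

#include <cstdint>

bool EqualParts(uint32_t LHSLo, uint32_t LHSHi,
                uint32_t RHSLo, uint32_t RHSHi) {
  // (lo1 ^ lo2) | (hi1 ^ hi2) is zero exactly when both halves match.
  return ((LHSLo ^ RHSLo) | (LHSHi ^ RHSHi)) == 0;
}

bool SignedLessThanParts(uint32_t LHSLo, int32_t LHSHi,
                         uint32_t RHSLo, int32_t RHSHi) {
  bool Tmp1 = LHSLo < RHSLo;  // low compare is always unsigned
  bool Tmp2 = LHSHi < RHSHi;  // high compare carries the signedness
  return (LHSHi == RHSHi) ? Tmp1 : Tmp2;
}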
NewLHS = Tmp2; NewRHS = SDValue(); return; } if (LHSHi == RHSHi) { // Comparing the low bits is enough. NewLHS = Tmp1; NewRHS = SDValue(); return; } // Lower with SETCCE if the target supports it. // FIXME: Make all targets support this, then remove the other lowering. if (TLI.getOperationAction( ISD::SETCCE, TLI.getTypeToExpandTo(*DAG.getContext(), LHSLo.getValueType())) == TargetLowering::Custom) { // SETCCE can detect < and >= directly. For > and <=, flip operands and // condition code. bool FlipOperands = false; switch (CCCode) { case ISD::SETGT: CCCode = ISD::SETLT; FlipOperands = true; break; case ISD::SETUGT: CCCode = ISD::SETULT; FlipOperands = true; break; case ISD::SETLE: CCCode = ISD::SETGE; FlipOperands = true; break; case ISD::SETULE: CCCode = ISD::SETUGE; FlipOperands = true; break; default: break; } if (FlipOperands) { std::swap(LHSLo, RHSLo); std::swap(LHSHi, RHSHi); } // Perform a wide subtraction, feeding the carry from the low part into // SETCCE. The SETCCE operation is essentially looking at the high part of // the result of LHS - RHS. It is negative iff LHS < RHS. It is zero or // positive iff LHS >= RHS. SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue); SDValue LowCmp = DAG.getNode(ISD::SUBC, dl, VTList, LHSLo, RHSLo); SDValue Res = DAG.getNode(ISD::SETCCE, dl, getSetCCResultType(LHSLo.getValueType()), LHSHi, RHSHi, LowCmp.getValue(1), DAG.getCondCode(CCCode)); NewLHS = Res; NewRHS = SDValue(); return; } NewLHS = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, ISD::SETEQ, false, DagCombineInfo, dl); if (!NewLHS.getNode()) NewLHS = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi, ISD::SETEQ); NewLHS = DAG.getSelect(dl, Tmp1.getValueType(), NewLHS, Tmp1, Tmp2); NewRHS = SDValue(); } SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) { SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3); ISD::CondCode CCCode = cast(N->getOperand(1))->get(); IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType()); CCCode = ISD::SETNE; } // Update N to have the operands specified. return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), DAG.getCondCode(CCCode), NewLHS, NewRHS, N->getOperand(4)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast(N->getOperand(4))->get(); IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!NewRHS.getNode()) { NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType()); CCCode = ISD::SETNE; } // Update N to have the operands specified. return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, N->getOperand(2), N->getOperand(3), DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast(N->getOperand(2))->get(); IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N)); // If ExpandSetCCOperands returned a scalar, use it. 
if (!NewRHS.getNode()) { assert(NewLHS.getValueType() == N->getValueType(0) && "Unexpected setcc expansion!"); return NewLHS; } // Otherwise, update N to have the operands specified. return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS, DAG.getCondCode(CCCode)), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Carry = N->getOperand(2); SDValue Cond = N->getOperand(3); SDLoc dl = SDLoc(N); SDValue LHSLo, LHSHi, RHSLo, RHSHi; GetExpandedInteger(LHS, LHSLo, LHSHi); GetExpandedInteger(RHS, RHSLo, RHSHi); // Expand to a SUBE for the low part and a smaller SETCCE for the high. SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue); SDValue LowCmp = DAG.getNode(ISD::SUBE, dl, VTList, LHSLo, RHSLo, Carry); return DAG.getNode(ISD::SETCCE, dl, N->getValueType(0), LHSHi, RHSHi, LowCmp.getValue(1), Cond); } SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) { // The value being shifted is legal, but the shift amount is too big. // It follows that either the result of the shift is undefined, or the // upper half of the shift amount is zero. Just use the lower half. SDValue Lo, Hi; GetExpandedInteger(N->getOperand(1), Lo, Hi); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Lo), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) { // The argument of RETURNADDR / FRAMEADDR builtin is 32 bit contant. This // surely makes pretty nice problems on 8/16 bit targets. Just truncate this // constant to valid type. SDValue Lo, Hi; GetExpandedInteger(N->getOperand(0), Lo, Hi); return SDValue(DAG.UpdateNodeOperands(N, Lo), 0); } SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) { SDValue Op = N->getOperand(0); EVT DstVT = N->getValueType(0); RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this SINT_TO_FP!"); return TLI.makeLibCall(DAG, LC, DstVT, Op, true, SDLoc(N)).first; } SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { if (ISD::isNormalStore(N)) return ExpandOp_NormalStore(N, OpNo); assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!"); assert(OpNo == 1 && "Can only expand the stored value so far"); EVT VT = N->getOperand(1).getValueType(); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); unsigned Alignment = N->getAlignment(); MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags(); AAMDNodes AAInfo = N->getAAInfo(); SDLoc dl(N); SDValue Lo, Hi; assert(NVT.isByteSized() && "Expanded type not byte sized!"); if (N->getMemoryVT().bitsLE(NVT)) { GetExpandedInteger(N->getValue(), Lo, Hi); return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), N->getMemoryVT(), Alignment, MMOFlags, AAInfo); } if (DAG.getDataLayout().isLittleEndian()) { // Little-endian - low bits are at low addresses. GetExpandedInteger(N->getValue(), Lo, Hi); Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags, AAInfo); unsigned ExcessBits = N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits(); EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits); // Increment the pointer to the other half. 
unsigned IncrementSize = NVT.getSizeInBits()/8; Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, dl, Ptr.getValueType())); Hi = DAG.getTruncStore( Ch, dl, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT, MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } // Big-endian - high bits are at low addresses. Favor aligned stores at // the cost of some bit-fiddling. GetExpandedInteger(N->getValue(), Lo, Hi); EVT ExtVT = N->getMemoryVT(); unsigned EBytes = ExtVT.getStoreSize(); unsigned IncrementSize = NVT.getSizeInBits()/8; unsigned ExcessBits = (EBytes - IncrementSize)*8; EVT HiVT = EVT::getIntegerVT(*DAG.getContext(), ExtVT.getSizeInBits() - ExcessBits); if (ExcessBits < NVT.getSizeInBits()) { // Transfer high bits from the top of Lo to the bottom of Hi. Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi, DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl, TLI.getPointerTy(DAG.getDataLayout()))); Hi = DAG.getNode( ISD::OR, dl, NVT, Hi, DAG.getNode(ISD::SRL, dl, NVT, Lo, DAG.getConstant(ExcessBits, dl, TLI.getPointerTy(DAG.getDataLayout())))); } // Store both the high bits and maybe some of the low bits. Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(), HiVT, Alignment, MMOFlags, AAInfo); // Increment the pointer to the other half. Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getConstant(IncrementSize, dl, Ptr.getValueType())); // Store the lowest ExcessBits bits in the second half. Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), EVT::getIntegerVT(*DAG.getContext(), ExcessBits), MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); } SDValue DAGTypeLegalizer::ExpandIntOp_TRUNCATE(SDNode *N) { SDValue InL, InH; GetExpandedInteger(N->getOperand(0), InL, InH); // Just truncate the low part of the source. return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), InL); } SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { SDValue Op = N->getOperand(0); EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); SDLoc dl(N); // The following optimization is valid only if every value in SrcVT (when // treated as signed) is representable in DstVT. Check that the mantissa // size of DstVT is >= than the number of bits in SrcVT -1. const fltSemantics &sem = DAG.EVTToAPFloatSemantics(DstVT); if (APFloat::semanticsPrecision(sem) >= SrcVT.getSizeInBits()-1 && TLI.getOperationAction(ISD::SINT_TO_FP, SrcVT) == TargetLowering::Custom){ // Do a signed conversion then adjust the result. SDValue SignedConv = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Op); SignedConv = TLI.LowerOperation(SignedConv, DAG); // The result of the signed conversion needs adjusting if the 'sign bit' of // the incoming integer was set. To handle this, we dynamically test to see // if it is set, and, if so, add a fudge factor. const uint64_t F32TwoE32 = 0x4F800000ULL; const uint64_t F32TwoE64 = 0x5F800000ULL; const uint64_t F32TwoE128 = 0x7F800000ULL; APInt FF(32, 0); if (SrcVT == MVT::i32) FF = APInt(32, F32TwoE32); else if (SrcVT == MVT::i64) FF = APInt(32, F32TwoE64); else if (SrcVT == MVT::i128) FF = APInt(32, F32TwoE128); else llvm_unreachable("Unsupported UINT_TO_FP!"); // Check whether the sign bit is set. 
SDValue Lo, Hi; GetExpandedInteger(Op, Lo, Hi); SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(Hi.getValueType()), Hi, DAG.getConstant(0, dl, Hi.getValueType()), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF.zext(64)), TLI.getPointerTy(DAG.getDataLayout())); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); if (DAG.getDataLayout().isBigEndian()) std::swap(Zero, Four); SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four); unsigned Alignment = cast(FudgePtr)->getAlignment(); FudgePtr = DAG.getNode(ISD::ADD, dl, FudgePtr.getValueType(), FudgePtr, Offset); Alignment = std::min(Alignment, 4u); // Load the value out, extending it from f32 to the destination float type. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad( ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, Alignment); return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge); } // Otherwise, use a libcall. RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this UINT_TO_FP!"); return TLI.makeLibCall(DAG, LC, DstVT, Op, true, dl).first; } SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) { SDLoc dl(N); SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, cast(N)->getMemoryVT(), N->getOperand(0), N->getOperand(1), N->getOperand(2), cast(N)->getMemOperand(), cast(N)->getOrdering(), cast(N)->getSynchScope()); return Swap.getValue(1); } SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) { SDValue InOp0 = N->getOperand(0); EVT InVT = InOp0.getValueType(); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); unsigned OutNumElems = OutVT.getVectorNumElements(); EVT NOutVTElem = NOutVT.getVectorElementType(); SDLoc dl(N); SDValue BaseIdx = N->getOperand(1); SmallVector Ops; Ops.reserve(OutNumElems); for (unsigned i = 0; i != OutNumElems; ++i) { // Extract the element from the original vector. SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(), BaseIdx, DAG.getConstant(i, dl, BaseIdx.getValueType())); SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InVT.getVectorElementType(), N->getOperand(0), Index); SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, Ext); // Insert the converted element to the new vector. 
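The UINT_TO_FP expansion above converts as signed and then conditionally adds a power-of-two fudge factor, selected from the constant pool by a setcc on the sign bit of the high part. A scalar sketch of the identity with a 32-bit source, so the correction is 2^32 and corresponds to the F32TwoE32 constant; the widths are scaled down purely for illustration:

#include <cstdint>

double UIntToFP(uint32_t X) {
  // A signed conversion sees X - 2^32 whenever the top bit of X is set.
  double SignedConv = (double)(int32_t)X;
  // The correction is the power of two for the source width; the
  // legalizer keeps the matching f32 constant in the constant pool and
  // picks it (or zero) with the sign test below.
  const double TwoPowSrcBits = 4294967296.0; // 2^32
  bool SignSet = (int32_t)X < 0;             // was the 'sign bit' of X set?
  return SignedConv + (SignSet ? TwoPowSrcBits : 0.0);
}

The precondition in the real code, that the destination's mantissa covers the source width minus one, guarantees this adjustment is exact rather than merely approximate.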
Ops.push_back(Op); } return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) { ShuffleVectorSDNode *SV = cast(N); EVT VT = N->getValueType(0); SDLoc dl(N); ArrayRef NewMask = SV->getMask().slice(0, VT.getVectorNumElements()); SDValue V0 = GetPromotedInteger(N->getOperand(0)); SDValue V1 = GetPromotedInteger(N->getOperand(1)); EVT OutVT = V0.getValueType(); return DAG.getVectorShuffle(OutVT, dl, V0, V1, NewMask); } SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) { EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); unsigned NumElems = N->getNumOperands(); EVT NOutVTElem = NOutVT.getVectorElementType(); SDLoc dl(N); SmallVector Ops; Ops.reserve(NumElems); for (unsigned i = 0; i != NumElems; ++i) { SDValue Op; // BUILD_VECTOR integer operand types are allowed to be larger than the // result's element type. This may still be true after the promotion. For // example, we might be promoting ( = BV , , ...) to // (v?i16 = BV , , ...), and we can't any_extend to . if (N->getOperand(i).getValueType().bitsLT(NOutVTElem)) Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(i)); else Op = N->getOperand(i); Ops.push_back(Op); } return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { SDLoc dl(N); assert(!N->getOperand(0).getValueType().isVector() && "Input must be a scalar"); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); EVT NOutVTElem = NOutVT.getVectorElementType(); SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(0)); return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op); } SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) { SDLoc dl(N); EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); EVT InElemTy = OutVT.getVectorElementType(); EVT OutElemTy = NOutVT.getVectorElementType(); unsigned NumElem = N->getOperand(0).getValueType().getVectorNumElements(); unsigned NumOutElem = NOutVT.getVectorNumElements(); unsigned NumOperands = N->getNumOperands(); assert(NumElem * NumOperands == NumOutElem && "Unexpected number of elements"); // Take the elements from the first vector. 
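The BUILD_VECTOR promotion above widens each scalar operand to the promoted element type, skipping operands that are already at least that wide, which BUILD_VECTOR permits. A standalone sketch of the v4i8 to v4i16 case, with a zero-extend standing in for ANY_EXTEND, whose high bits are unspecified:

#include <array>
#include <cstddef>
#include <cstdint>

// Widen each element of a narrow build-vector into the promoted element type.
std::array<uint16_t, 4> promoteBuildVector(const std::array<uint8_t, 4> &Elts) {
  std::array<uint16_t, 4> Promoted{};
  for (std::size_t I = 0; I < Elts.size(); ++I)
    Promoted[I] = Elts[I];  // i8 operand extended into an i16 lane
  return Promoted;
}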
SmallVector Ops(NumOutElem); for (unsigned i = 0; i < NumOperands; ++i) { SDValue Op = N->getOperand(i); for (unsigned j = 0; j < NumElem; ++j) { SDValue Ext = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, dl, InElemTy, Op, DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); Ops[i * NumElem + j] = DAG.getNode(ISD::ANY_EXTEND, dl, OutElemTy, Ext); } } return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops); } SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) { EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); EVT NOutVTElem = NOutVT.getVectorElementType(); SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); SDValue ConvElem = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(1)); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NOutVT, V0, ConvElem, N->getOperand(2)); } SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) { SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); SDValue V1 = DAG.getZExtOrTrunc(N->getOperand(1), dl, TLI.getVectorIdxTy(DAG.getDataLayout())); SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, V0->getValueType(0).getScalarType(), V0, V1); // EXTRACT_VECTOR_ELT can return types which are wider than the incoming // element types. If this is the case then we need to expand the outgoing // value and not truncate it. return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0)); } SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N) { SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); MVT InVT = V0.getValueType().getSimpleVT(); MVT OutVT = MVT::getVectorVT(InVT.getVectorElementType(), N->getValueType(0).getVectorNumElements()); SDValue Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, V0, N->getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), Ext); } SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) { SDLoc dl(N); unsigned NumElems = N->getNumOperands(); EVT RetSclrTy = N->getValueType(0).getVectorElementType(); SmallVector NewOps; NewOps.reserve(NumElems); // For each incoming vector for (unsigned VecIdx = 0; VecIdx != NumElems; ++VecIdx) { SDValue Incoming = GetPromotedInteger(N->getOperand(VecIdx)); EVT SclrTy = Incoming->getValueType(0).getVectorElementType(); unsigned NumElem = Incoming->getValueType(0).getVectorNumElements(); for (unsigned i=0; igetValueType(0), NewOps); } Index: projects/clang391-import/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- projects/clang391-import/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (revision 309436) +++ projects/clang391-import/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (revision 309437) @@ -1,3166 +1,3174 @@ //===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // /// \file /// \brief SI Implementation of TargetInstrInfo. 
// //===----------------------------------------------------------------------===// #include "SIInstrInfo.h" #include "AMDGPUTargetMachine.h" #include "GCNHazardRecognizer.h" #include "SIDefines.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/IR/Function.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/Debug.h" using namespace llvm; SIInstrInfo::SIInstrInfo(const SISubtarget &ST) : AMDGPUInstrInfo(ST), RI(), ST(ST) {} //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks //===----------------------------------------------------------------------===// static unsigned getNumOperandsNoGlue(SDNode *Node) { unsigned N = Node->getNumOperands(); while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) --N; return N; } static SDValue findChainOperand(SDNode *Load) { SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); return LastOp; } /// \brief Returns true if both nodes have the same value for the given /// operand \p Op, or if both nodes do not have this operand. static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { unsigned Opc0 = N0->getMachineOpcode(); unsigned Opc1 = N1->getMachineOpcode(); int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); if (Op0Idx == -1 && Op1Idx == -1) return true; if ((Op0Idx == -1 && Op1Idx != -1) || (Op1Idx == -1 && Op0Idx != -1)) return false; // getNamedOperandIdx returns the index for the MachineInstr's operands, // which includes the result as the first operand. We are indexing into the // MachineSDNode's operands, so we need to skip the result operand to get // the real index. --Op0Idx; --Op1Idx; return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); } bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, AliasAnalysis *AA) const { // TODO: The generic check fails for VALU instructions that should be // rematerializable due to implicit reads of exec. We really want all of the // generic logic for this except for this. switch (MI.getOpcode()) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: return true; default: return false; } } bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const { if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) return false; unsigned Opc0 = Load0->getMachineOpcode(); unsigned Opc1 = Load1->getMachineOpcode(); // Make sure both are actually loads. if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) return false; if (isDS(Opc0) && isDS(Opc1)) { // FIXME: Handle this case: if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) return false; // Check base reg. if (Load0->getOperand(1) != Load1->getOperand(1)) return false; // Check chain. if (findChainOperand(Load0) != findChainOperand(Load1)) return false; // Skip read2 / write2 variants for simplicity. // TODO: We should report true if the used offsets are adjacent (excluded // st64 versions). 
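nodesHaveSameOperandValue above decrements the indices because AMDGPU::getNamedOperandIdx answers in MachineInstr numbering, where slot 0 is the result, while MachineSDNode operand lists omit the result. A tiny helper capturing that adjustment (hypothetical name, shown only to make the off-by-one explicit):

// Map a MachineInstr operand index to the matching MachineSDNode index,
// preserving the -1 "operand not present" sentinel.
int toSDNodeOperandIdx(int MIOperandIdx) {
  return MIOperandIdx < 0 ? -1 : MIOperandIdx - 1;
}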
if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) return false; Offset0 = cast(Load0->getOperand(2))->getZExtValue(); Offset1 = cast(Load1->getOperand(2))->getZExtValue(); return true; } if (isSMRD(Opc0) && isSMRD(Opc1)) { assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); // Check base reg. if (Load0->getOperand(0) != Load1->getOperand(0)) return false; const ConstantSDNode *Load0Offset = dyn_cast(Load0->getOperand(1)); const ConstantSDNode *Load1Offset = dyn_cast(Load1->getOperand(1)); if (!Load0Offset || !Load1Offset) return false; // Check chain. if (findChainOperand(Load0) != findChainOperand(Load1)) return false; Offset0 = Load0Offset->getZExtValue(); Offset1 = Load1Offset->getZExtValue(); return true; } // MUBUF and MTBUF can access the same addresses. if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { // MUBUF and MTBUF have vaddr at different indices. if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || findChainOperand(Load0) != findChainOperand(Load1) || !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) return false; int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); if (OffIdx0 == -1 || OffIdx1 == -1) return false; // getNamedOperandIdx returns the index for MachineInstrs. Since they // inlcude the output in the operand list, but SDNodes don't, we need to // subtract the index by one. --OffIdx0; --OffIdx1; SDValue Off0 = Load0->getOperand(OffIdx0); SDValue Off1 = Load1->getOperand(OffIdx1); // The offset might be a FrameIndexSDNode. if (!isa(Off0) || !isa(Off1)) return false; Offset0 = cast(Off0)->getZExtValue(); Offset1 = cast(Off1)->getZExtValue(); return true; } return false; } static bool isStride64(unsigned Opc) { switch (Opc) { case AMDGPU::DS_READ2ST64_B32: case AMDGPU::DS_READ2ST64_B64: case AMDGPU::DS_WRITE2ST64_B32: case AMDGPU::DS_WRITE2ST64_B64: return true; default: return false; } } bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, const TargetRegisterInfo *TRI) const { unsigned Opc = LdSt.getOpcode(); if (isDS(LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(LdSt, AMDGPU::OpName::offset); if (OffsetImm) { // Normal, single offset LDS instruction. const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr); BaseReg = AddrReg->getReg(); Offset = OffsetImm->getImm(); return true; } // The 2 offset instructions use offset0 and offset1 instead. We can treat // these as a load with a single offset if the 2 offsets are consecutive. We // will use this for some partially aligned loads. const MachineOperand *Offset0Imm = getNamedOperand(LdSt, AMDGPU::OpName::offset0); const MachineOperand *Offset1Imm = getNamedOperand(LdSt, AMDGPU::OpName::offset1); uint8_t Offset0 = Offset0Imm->getImm(); uint8_t Offset1 = Offset1Imm->getImm(); if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { // Each of these offsets is in element sized units, so we need to convert // to bytes of the individual reads. 
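The DS read2/write2 handling that follows treats a pair with consecutive offset0/offset1 as one access at EltSize * offset0 bytes, with the ST64 variants spacing elements 64 entries apart. A standalone sketch of just that arithmetic (hypothetical helper; the element size is supplied by the caller):

#include <cstdint>

// Summarize a consecutive read2/write2 pair as a single byte offset.
bool summarizeDSPair(uint8_t Offset0, uint8_t Offset1, unsigned EltSize,
                     bool IsStride64, int64_t &ByteOffset) {
  if (!(Offset1 > Offset0 && Offset1 - Offset0 == 1))
    return false;                       // not a contiguous pair
  if (IsStride64)
    EltSize *= 64;                      // DS_*2ST64_* element spacing
  ByteOffset = int64_t(EltSize) * Offset0;
  return true;
}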
unsigned EltSize; if (LdSt.mayLoad()) EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; else { assert(LdSt.mayStore()); int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); EltSize = getOpRegClass(LdSt, Data0Idx)->getSize(); } if (isStride64(Opc)) EltSize *= 64; const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr); BaseReg = AddrReg->getReg(); Offset = EltSize * Offset0; return true; } return false; } if (isMUBUF(LdSt) || isMTBUF(LdSt)) { if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) return false; const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (!AddrReg) return false; const MachineOperand *OffsetImm = getNamedOperand(LdSt, AMDGPU::OpName::offset); BaseReg = AddrReg->getReg(); Offset = OffsetImm->getImm(); return true; } if (isSMRD(LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(LdSt, AMDGPU::OpName::offset); if (!OffsetImm) return false; const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase); BaseReg = SBaseReg->getReg(); Offset = OffsetImm->getImm(); return true; } if (isFLAT(LdSt)) { const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr); BaseReg = AddrReg->getReg(); Offset = 0; return true; } return false; } bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt, unsigned NumLoads) const { const MachineOperand *FirstDst = nullptr; const MachineOperand *SecondDst = nullptr; if (isDS(FirstLdSt) && isDS(SecondLdSt)) { FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); } if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); } if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) { FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); } if (!FirstDst || !SecondDst) return false; // Try to limit clustering based on the total number of bytes loaded // rather than the number of instructions. This is done to help reduce // register pressure. The method used is somewhat inexact, though, // because it assumes that all loads in the cluster will load the // same number of bytes as FirstLdSt. // The unit of this value is bytes. // FIXME: This needs finer tuning. unsigned LoadClusterThreshold = 16; const MachineRegisterInfo &MRI = FirstLdSt.getParent()->getParent()->getRegInfo(); const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; } void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { // If we are trying to copy to or from SCC, there is a bug somewhere else in // the backend. While it may be theoretically possible to do this, it should // never be necessary. 
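shouldClusterMemOps above caps clustering by an estimate of bytes loaded rather than by instruction count, assuming every load in the cluster is as wide as the first destination register. The heuristic reduces to this check (the 16-byte threshold is the one in the code above):

// Cluster while the estimated total payload stays within the threshold.
bool withinLoadClusterBudget(unsigned NumLoads, unsigned FirstDstRegBytes) {
  const unsigned LoadClusterThresholdBytes = 16;
  return NumLoads * FirstDstRegBytes <= LoadClusterThresholdBytes;
}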
assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); static const int16_t Sub0_15[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, }; static const int16_t Sub0_15_64[] = { AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, }; static const int16_t Sub0_7[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, }; static const int16_t Sub0_7_64[] = { AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, }; static const int16_t Sub0_3[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, }; static const int16_t Sub0_3_64[] = { AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, }; static const int16_t Sub0_2[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, }; static const int16_t Sub0_1[] = { AMDGPU::sub0, AMDGPU::sub1, }; unsigned Opcode; ArrayRef SubIndices; if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { if (DestReg == AMDGPU::VCC) { if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) .addReg(SrcReg, getKillRegState(KillSrc)); } else { // FIXME: Hack until VReg_1 removed. assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32)) .addImm(0) .addReg(SrcReg, getKillRegState(KillSrc)); } return; } assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); Opcode = AMDGPU::S_MOV_B64; SubIndices = Sub0_3_64; } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); Opcode = AMDGPU::S_MOV_B64; SubIndices = Sub0_7_64; } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); Opcode = AMDGPU::S_MOV_B64; SubIndices = Sub0_15_64; } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || AMDGPU::SReg_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || AMDGPU::SReg_64RegClass.contains(SrcReg)); Opcode = AMDGPU::V_MOV_B32_e32; SubIndices = Sub0_1; } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); Opcode = AMDGPU::V_MOV_B32_e32; SubIndices = Sub0_2; } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || AMDGPU::SReg_128RegClass.contains(SrcReg)); Opcode = AMDGPU::V_MOV_B32_e32; SubIndices = Sub0_3; } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || AMDGPU::SReg_256RegClass.contains(SrcReg)); Opcode = AMDGPU::V_MOV_B32_e32; SubIndices = Sub0_7; } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { 
assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || AMDGPU::SReg_512RegClass.contains(SrcReg)); Opcode = AMDGPU::V_MOV_B32_e32; SubIndices = Sub0_15; } else { llvm_unreachable("Can't copy register!"); } bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { unsigned SubIdx; if (Forward) SubIdx = SubIndices[Idx]; else SubIdx = SubIndices[SubIndices.size() - Idx - 1]; MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)); Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); if (Idx == SubIndices.size() - 1) Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); if (Idx == 0) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); Builder.addReg(SrcReg, RegState::Implicit); } } int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { const unsigned Opcode = MI.getOpcode(); int NewOpc; // Try to map original to commuted opcode NewOpc = AMDGPU::getCommuteRev(Opcode); if (NewOpc != -1) // Check if the commuted (REV) opcode exists on the target. return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; // Try to map commuted to original opcode NewOpc = AMDGPU::getCommuteOrig(Opcode); if (NewOpc != -1) // Check if the original (non-REV) opcode exists on the target. return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; return Opcode; } unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { if (DstRC->getSize() == 4) { return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { return AMDGPU::S_MOV_B64; } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { return AMDGPU::V_MOV_B64_PSEUDO; } return AMDGPU::COPY; } static unsigned getSGPRSpillSaveOpcode(unsigned Size) { switch (Size) { case 4: return AMDGPU::SI_SPILL_S32_SAVE; case 8: return AMDGPU::SI_SPILL_S64_SAVE; case 16: return AMDGPU::SI_SPILL_S128_SAVE; case 32: return AMDGPU::SI_SPILL_S256_SAVE; case 64: return AMDGPU::SI_SPILL_S512_SAVE; default: llvm_unreachable("unknown register size"); } } static unsigned getVGPRSpillSaveOpcode(unsigned Size) { switch (Size) { case 4: return AMDGPU::SI_SPILL_V32_SAVE; case 8: return AMDGPU::SI_SPILL_V64_SAVE; case 12: return AMDGPU::SI_SPILL_V96_SAVE; case 16: return AMDGPU::SI_SPILL_V128_SAVE; case 32: return AMDGPU::SI_SPILL_V256_SAVE; case 64: return AMDGPU::SI_SPILL_V512_SAVE; default: llvm_unreachable("unknown register size"); } } void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned Size = FrameInfo->getObjectSize(FrameIndex); unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, Size, Align); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { // m0 may not be allowed for readlane. 
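copyPhysReg above walks the sub-register moves forward when the destination's hardware index is at or below the source's and backward otherwise, so an overlapping source lane is read before it is clobbered. The same memmove-style rule on a plain array, as a standalone sketch:

#include <cstddef>

// Copy lanes of an overlapping range in whichever direction is safe.
void copyLanes(unsigned *Dst, const unsigned *Src, std::size_t NumLanes) {
  if (Dst <= Src) {
    for (std::size_t I = 0; I < NumLanes; ++I)
      Dst[I] = Src[I];                  // forward: destination below source
  } else {
    for (std::size_t I = NumLanes; I-- > 0;)
      Dst[I] = Src[I];                  // backward: destination above source
  }
}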
MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); } // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling // SGPRs. unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // src .addFrameIndex(FrameIndex) // frame_idx .addMemOperand(MMO); return; } if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" " spill register"); BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) .addReg(SrcReg); return; } assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // src .addFrameIndex(FrameIndex) // frame_idx .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addImm(0) // offset .addMemOperand(MMO); } static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { switch (Size) { case 4: return AMDGPU::SI_SPILL_S32_RESTORE; case 8: return AMDGPU::SI_SPILL_S64_RESTORE; case 16: return AMDGPU::SI_SPILL_S128_RESTORE; case 32: return AMDGPU::SI_SPILL_S256_RESTORE; case 64: return AMDGPU::SI_SPILL_S512_RESTORE; default: llvm_unreachable("unknown register size"); } } static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { switch (Size) { case 4: return AMDGPU::SI_SPILL_V32_RESTORE; case 8: return AMDGPU::SI_SPILL_V64_RESTORE; case 12: return AMDGPU::SI_SPILL_V96_RESTORE; case 16: return AMDGPU::SI_SPILL_V128_RESTORE; case 32: return AMDGPU::SI_SPILL_V256_RESTORE; case 64: return AMDGPU::SI_SPILL_V512_RESTORE; default: llvm_unreachable("unknown register size"); } } void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); const SIMachineFunctionInfo *MFI = MF->getInfo(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); unsigned Size = FrameInfo->getObjectSize(FrameIndex); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); MachineMemOperand *MMO = MF->getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad, Size, Align); if (RI.isSGPRClass(RC)) { // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { // m0 may not be allowed for readlane. 
MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // frame_idx .addMemOperand(MMO); return; } if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" " restore register"); BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); return; } assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // frame_idx .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addImm(0) // offset .addMemOperand(MMO); } /// \param @Offset Offset in bytes of the FrameIndex being spilled unsigned SIInstrInfo::calculateLDSSpillAddress( MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, unsigned FrameOffset, unsigned Size) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); const SISubtarget &ST = MF->getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); unsigned WavefrontSize = ST.getWavefrontSize(); unsigned TIDReg = MFI->getTIDReg(); if (!MFI->hasCalculatedTID()) { MachineBasicBlock &Entry = MBB.getParent()->front(); MachineBasicBlock::iterator Insert = Entry.front(); DebugLoc DL = Insert->getDebugLoc(); TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, *MF); if (TIDReg == AMDGPU::NoRegister) return TIDReg; if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) && WorkGroupSize > WavefrontSize) { unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); } RS->enterBasicBlock(Entry); // FIXME: Can we scavenge an SReg_64 and access the subregs? 
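calculateLDSSpillAddress (continued below) linearizes three preloaded components into a per-lane slot index, scales it to 4-byte units, and adds it to the frame's base offset within LDS. A standalone sketch of only the arithmetic; the parameter names follow the register loads in the emitted sequence, and since the in-source comments label them slightly differently, treat the names as illustrative:

#include <cstdint>

// Reconstruct the spill address computed by the emitted instruction sequence.
uint32_t ldsSpillAddress(uint32_t LDSSize, uint32_t FrameOffset,
                         uint32_t WorkGroupSize, uint32_t CompY, uint32_t CompZ,
                         uint32_t IdX, uint32_t IdY, uint32_t IdZ) {
  uint32_t Tid = (CompY * CompZ) * IdX + CompZ * IdY + IdZ; // linearized index
  uint32_t TidBytes = Tid << 2;                             // 4 bytes per slot
  return LDSSize + FrameOffset * WorkGroupSize + TidBytes;  // base + per-lane
}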
unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) .addReg(InputPtrReg) .addImm(SI::KernelInputOffsets::NGROUPS_Z); BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) .addReg(InputPtrReg) .addImm(SI::KernelInputOffsets::NGROUPS_Y); // NGROUPS.X * NGROUPS.Y BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) .addReg(STmp1) .addReg(STmp0); // (NGROUPS.X * NGROUPS.Y) * TIDIG.X BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) .addReg(STmp1) .addReg(TIDIGXReg); // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X) BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) .addReg(STmp0) .addReg(TIDIGYReg) .addReg(TIDReg); // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) .addReg(TIDReg) .addReg(TIDIGZReg); } else { // Get the wave id BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), TIDReg) .addImm(-1) .addImm(0); BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), TIDReg) .addImm(-1) .addReg(TIDReg); } BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), TIDReg) .addImm(2) .addReg(TIDReg); MFI->setTIDReg(TIDReg); } // Add FrameIndex to LDS offset unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) .addImm(LDSOffset) .addReg(TIDReg); return TmpReg; } void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, int Count) const { DebugLoc DL = MBB.findDebugLoc(MI); while (Count > 0) { int Arg; if (Count >= 8) Arg = 7; else Arg = Count - 1; Count -= 8; BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)) .addImm(Arg); } } void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { insertWaitStates(MBB, MI, 1); } unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return 1; // FIXME: Do wait states equal cycles? case AMDGPU::S_NOP: return MI.getOperand(0).getImm() + 1; } } bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); switch (MI.getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); case AMDGPU::V_MOV_B64_PSEUDO: { unsigned Dst = MI.getOperand(0).getReg(); unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); const MachineOperand &SrcOp = MI.getOperand(1); // FIXME: Will this work for 64-bit floating point immediates? 
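insertWaitStates above emits S_NOPs whose 3-bit immediate encodes 1 to 8 wait states (0 meaning one), chaining them until the request is covered. The same chunking in isolation:

#include <vector>

// Produce the S_NOP immediates needed to cover Count wait states.
std::vector<int> nopImmediatesFor(int Count) {
  std::vector<int> Imms;
  while (Count > 0) {
    Imms.push_back(Count >= 8 ? 7 : Count - 1);  // immediate 0..7 covers 1..8 states
    Count -= 8;
  }
  return Imms;
}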
assert(!SrcOp.isFPImm()); if (SrcOp.isImm()) { APInt Imm(64, SrcOp.getImm()); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) .addImm(Imm.getLoBits(32).getZExtValue()) .addReg(Dst, RegState::Implicit | RegState::Define); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) .addImm(Imm.getHiBits(32).getZExtValue()) .addReg(Dst, RegState::Implicit | RegState::Define); } else { assert(SrcOp.isReg()); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) .addReg(Dst, RegState::Implicit | RegState::Define); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) .addReg(Dst, RegState::Implicit | RegState::Define); } MI.eraseFromParent(); break; } case AMDGPU::V_CNDMASK_B64_PSEUDO: { unsigned Dst = MI.getOperand(0).getReg(); unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); unsigned Src0 = MI.getOperand(1).getReg(); unsigned Src1 = MI.getOperand(2).getReg(); const MachineOperand &SrcCond = MI.getOperand(3); BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) .addReg(SrcCond.getReg()) .addReg(Dst, RegState::Implicit | RegState::Define); BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) .addReg(SrcCond.getReg(), getKillRegState(SrcCond.isKill())) .addReg(Dst, RegState::Implicit | RegState::Define); MI.eraseFromParent(); break; } case AMDGPU::SI_PC_ADD_REL_OFFSET: { const SIRegisterInfo *TRI = static_cast(ST.getRegisterInfo()); MachineFunction &MF = *MBB.getParent(); unsigned Reg = MI.getOperand(0).getReg(); unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); // Create a bundle so these instructions won't be re-ordered by the // post-RA scheduler. MIBundleBuilder Bundler(MBB, MI); Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); // Add 32-bit offset from this instruction to the start of the // constant data. Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) .addReg(RegLo) .addOperand(MI.getOperand(1))); Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) .addReg(RegHi) .addImm(0)); llvm::finalizeBundle(MBB, Bundler.begin()); MI.eraseFromParent(); break; } } return true; } /// Commutes the operands in the given instruction. /// The commutable operands are specified by their indices OpIdx0 and OpIdx1. /// /// Do not call this method for a non-commutable instruction or for /// non-commutable pair of operand indices OpIdx0 and OpIdx1. /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. 
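The V_MOV_B64_PSEUDO expansion above splits a 64-bit immediate into two 32-bit moves via getLoBits/getHiBits. The split itself is simply:

#include <cstdint>
#include <utility>

// Split a 64-bit immediate into its low and high 32-bit halves.
std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
  return {uint32_t(Imm & 0xFFFFFFFFu), uint32_t(Imm >> 32)};
}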
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const { int CommutedOpcode = commuteOpcode(MI); if (CommutedOpcode == -1) return nullptr; int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); MachineOperand &Src0 = MI.getOperand(Src0Idx); if (!Src0.isReg()) return nullptr; int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); if ((OpIdx0 != static_cast(Src0Idx) || OpIdx1 != static_cast(Src1Idx)) && (OpIdx0 != static_cast(Src1Idx) || OpIdx1 != static_cast(Src0Idx))) return nullptr; MachineOperand &Src1 = MI.getOperand(Src1Idx); if (isVOP2(MI) || isVOPC(MI)) { const MCInstrDesc &InstrDesc = MI.getDesc(); // For VOP2 and VOPC instructions, any operand type is valid to use for // src0. Make sure we can use the src0 as src1. // // We could be stricter here and only allow commuting if there is a reason // to do so. i.e. if both operands are VGPRs there is no real benefit, // although MachineCSE attempts to find matches by commuting. const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) return nullptr; } MachineInstr *CommutedMI = &MI; if (!Src1.isReg()) { // Allow commuting instructions with Imm operands. if (NewMI || !Src1.isImm() || (!isVOP2(MI) && !isVOP3(MI))) { return nullptr; } // Be sure to copy the source modifiers to the right place. if (MachineOperand *Src0Mods = getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) { MachineOperand *Src1Mods = getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); int Src0ModsVal = Src0Mods->getImm(); if (!Src1Mods && Src0ModsVal != 0) return nullptr; // XXX - This assert might be a lie. It might be useful to have a neg // modifier with 0.0. int Src1ModsVal = Src1Mods->getImm(); assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); Src1Mods->setImm(Src0ModsVal); Src0Mods->setImm(Src1ModsVal); } unsigned Reg = Src0.getReg(); unsigned SubReg = Src0.getSubReg(); if (Src1.isImm()) Src0.ChangeToImmediate(Src1.getImm()); else llvm_unreachable("Should only have immediates"); Src1.ChangeToRegister(Reg, false); Src1.setSubReg(SubReg); } else { CommutedMI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); } if (CommutedMI) CommutedMI->setDesc(get(CommutedOpcode)); return CommutedMI; } // This needs to be implemented because the source modifiers may be inserted // between the true commutable operands, and the base // TargetInstrInfo::commuteInstruction uses it. bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const { const MCInstrDesc &MCID = MI.getDesc(); if (!MCID.isCommutable()) return false; unsigned Opc = MI.getOpcode(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) return false; // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on // immediate. Also, immediate src0 operand is not handled in // SIInstrInfo::commuteInstruction(); if (!MI.getOperand(Src0Idx).isReg()) return false; int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return false; MachineOperand &Src1 = MI.getOperand(Src1Idx); if (Src1.isImm()) { // SIInstrInfo::commuteInstruction() does support commuting the immediate // operand src1 in 2 and 3 operand instructions. 
if (!isVOP2(MI.getOpcode()) && !isVOP3(MI.getOpcode())) return false; } else if (Src1.isReg()) { // If any source modifiers are set, the generic instruction commuting won't // understand how to copy the source modifiers. if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)) return false; } else return false; return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { switch (Cond) { case SIInstrInfo::SCC_TRUE: return AMDGPU::S_CBRANCH_SCC1; case SIInstrInfo::SCC_FALSE: return AMDGPU::S_CBRANCH_SCC0; case SIInstrInfo::VCCNZ: return AMDGPU::S_CBRANCH_VCCNZ; case SIInstrInfo::VCCZ: return AMDGPU::S_CBRANCH_VCCZ; case SIInstrInfo::EXECNZ: return AMDGPU::S_CBRANCH_EXECNZ; case SIInstrInfo::EXECZ: return AMDGPU::S_CBRANCH_EXECZ; default: llvm_unreachable("invalid branch predicate"); } } SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { switch (Opcode) { case AMDGPU::S_CBRANCH_SCC0: return SCC_FALSE; case AMDGPU::S_CBRANCH_SCC1: return SCC_TRUE; case AMDGPU::S_CBRANCH_VCCNZ: return VCCNZ; case AMDGPU::S_CBRANCH_VCCZ: return VCCZ; case AMDGPU::S_CBRANCH_EXECNZ: return EXECNZ; case AMDGPU::S_CBRANCH_EXECZ: return EXECZ; default: return INVALID_BR; } } bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool AllowModify) const { MachineBasicBlock::iterator I = MBB.getFirstTerminator(); if (I == MBB.end()) return false; if (I->getOpcode() == AMDGPU::S_BRANCH) { // Unconditional Branch TBB = I->getOperand(0).getMBB(); return false; } BranchPredicate Pred = getBranchPredicate(I->getOpcode()); if (Pred == INVALID_BR) return true; MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); Cond.push_back(MachineOperand::CreateImm(Pred)); ++I; if (I == MBB.end()) { // Conditional branch followed by fall-through. 
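analyzeBranch above, continued below, recognizes only three terminator shapes: a lone unconditional S_BRANCH, a conditional branch that falls through, and a conditional branch followed by an unconditional one; anything else is reported as unanalyzable. A standalone classification of those shapes, assuming the terminators have already been categorized:

#include <vector>

enum class TermKind { Unconditional, Conditional, Other };
enum class BranchShape { FallThrough, UncondOnly, CondFallThrough, CondThenUncond, Unknown };

// Classify a block's terminator sequence the way analyzeBranch accepts it.
BranchShape classifyTerminators(const std::vector<TermKind> &Terms) {
  if (Terms.empty())
    return BranchShape::FallThrough;
  if (Terms[0] == TermKind::Unconditional)
    return BranchShape::UncondOnly;
  if (Terms[0] != TermKind::Conditional)
    return BranchShape::Unknown;
  if (Terms.size() == 1)
    return BranchShape::CondFallThrough;
  return Terms[1] == TermKind::Unconditional ? BranchShape::CondThenUncond
                                             : BranchShape::Unknown;
}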
TBB = CondBB; return false; } if (I->getOpcode() == AMDGPU::S_BRANCH) { TBB = CondBB; FBB = I->getOperand(0).getMBB(); return false; } return true; } unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.getFirstTerminator(); unsigned Count = 0; while (I != MBB.end()) { MachineBasicBlock::iterator Next = std::next(I); I->eraseFromParent(); ++Count; I = Next; } return Count; } unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef Cond, const DebugLoc &DL) const { if (!FBB && Cond.empty()) { BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) .addMBB(TBB); return 1; } assert(TBB && Cond[0].isImm()); unsigned Opcode = getBranchOpcode(static_cast(Cond[0].getImm())); if (!FBB) { BuildMI(&MBB, DL, get(Opcode)) .addMBB(TBB); return 1; } assert(TBB && FBB); BuildMI(&MBB, DL, get(Opcode)) .addMBB(TBB); BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) .addMBB(FBB); return 2; } bool SIInstrInfo::ReverseBranchCondition( SmallVectorImpl &Cond) const { assert(Cond.size() == 1); Cond[0].setImm(-Cond[0].getImm()); return false; } static void removeModOperands(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers); int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers); MI.RemoveOperand(Src2ModIdx); MI.RemoveOperand(Src1ModIdx); MI.RemoveOperand(Src0ModIdx); } // TODO: Maybe this should be removed this and custom fold everything in // SIFoldOperands? bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, MachineRegisterInfo *MRI) const { if (!MRI->hasOneNonDBGUse(Reg)) return false; unsigned Opc = UseMI.getOpcode(); if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) { // Don't fold if we are using source modifiers. The new VOP2 instructions // don't have them. if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) || hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) || hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) { return false; } const MachineOperand &ImmOp = DefMI.getOperand(1); // If this is a free constant, there's no reason to do this. // TODO: We could fold this here instead of letting SIFoldOperands do it // later. if (isInlineConstant(ImmOp, 4)) return false; MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); // Multiplied part is the constant: Use v_madmk_f32 // We should only expect these to be on src0 due to canonicalizations. if (Src0->isReg() && Src0->getReg() == Reg) { if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) return false; if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) return false; // We need to swap operands 0 and 1 since madmk constant is at operand 1. const int64_t Imm = DefMI.getOperand(1).getImm(); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. // Remove these first since they are at the end. 
UseMI.RemoveOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); UseMI.RemoveOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); unsigned Src1Reg = Src1->getReg(); unsigned Src1SubReg = Src1->getSubReg(); Src0->setReg(Src1Reg); Src0->setSubReg(Src1SubReg); Src0->setIsKill(Src1->isKill()); if (Opc == AMDGPU::V_MAC_F32_e64) { UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); } Src1->ChangeToImmediate(Imm); removeModOperands(UseMI); UseMI.setDesc(get(AMDGPU::V_MADMK_F32)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) DefMI.eraseFromParent(); return true; } // Added part is the constant: Use v_madak_f32 if (Src2->isReg() && Src2->getReg() == Reg) { // Not allowed to use constant bus for another operand. // We can however allow an inline immediate as src0. if (!Src0->isImm() && (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) return false; if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) return false; const int64_t Imm = DefMI.getOperand(1).getImm(); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. // Remove these first since they are at the end. UseMI.RemoveOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); UseMI.RemoveOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); if (Opc == AMDGPU::V_MAC_F32_e64) { UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); } // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); // These come before src2. removeModOperands(UseMI); UseMI.setDesc(get(AMDGPU::V_MADAK_F32)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) DefMI.eraseFromParent(); return true; } } return false; } static bool offsetsDoNotOverlap(int WidthA, int OffsetA, int WidthB, int OffsetB) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; return LowOffset + LowWidth <= HighOffset; } bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const { unsigned BaseReg0, BaseReg1; int64_t Offset0, Offset1; if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { // FIXME: Handle ds_read2 / ds_write2. return false; } unsigned Width0 = (*MIa.memoperands_begin())->getSize(); unsigned Width1 = (*MIb.memoperands_begin())->getSize(); if (BaseReg0 == BaseReg1 && offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { return true; } } return false; } bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const { assert((MIa.mayLoad() || MIa.mayStore()) && "MIa must load from or modify a memory location"); assert((MIb.mayLoad() || MIb.mayStore()) && "MIb must load from or modify a memory location"); if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) return false; // XXX - Can we relax this between address spaces? if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, // e.g. private accesses lowered to use MUBUF instructions on a scratch // buffer. 
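FoldImmediate above rewrites a MAD/MAC whose multiplicand is a materialized constant into V_MADMK_F32 and one whose addend is constant into V_MADAK_F32. Assuming the usual reading of those opcodes (madmk computes src0 * literal + src1, madak computes src0 * src1 + literal), the algebra being matched is:

// Hedged reading of the two folded forms; the literal K replaces the defining move.
float madmk(float Src0, float K, float Src1) { return Src0 * K + Src1; }
float madak(float Src0, float Src1, float K) { return Src0 * Src1 + K; }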
if (isDS(MIa)) { if (isDS(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); return !isFLAT(MIb); } if (isMUBUF(MIa) || isMTBUF(MIa)) { if (isMUBUF(MIb) || isMTBUF(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); return !isFLAT(MIb) && !isSMRD(MIb); } if (isSMRD(MIa)) { if (isSMRD(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); } if (isFLAT(MIa)) { if (isFLAT(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); return false; } return false; } MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, MachineInstr &MI, LiveVariables *LV) const { switch (MI.getOpcode()) { default: return nullptr; case AMDGPU::V_MAC_F32_e64: break; case AMDGPU::V_MAC_F32_e32: { const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); if (Src0->isImm() && !isInlineConstant(*Src0, 4)) return nullptr; break; } } const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32)) .addOperand(*Dst) .addImm(0) // Src0 mods .addOperand(*Src0) .addImm(0) // Src1 mods .addOperand(*Src1) .addImm(0) // Src mods .addOperand(*Src2) .addImm(0) // clamp .addImm(0); // omod } bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const { // XXX - Do we want the SP check in the base implementation? // Target-independent instructions do not have an implicit-use of EXEC, even // when they operate on VGPRs. Treating EXEC modifications as scheduling // boundaries prevents incorrect movements of such instructions. return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || MI.modifiesRegister(AMDGPU::EXEC, &RI); } bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { int64_t SVal = Imm.getSExtValue(); if (SVal >= -16 && SVal <= 64) return true; if (Imm.getBitWidth() == 64) { uint64_t Val = Imm.getZExtValue(); return (DoubleToBits(0.0) == Val) || (DoubleToBits(1.0) == Val) || (DoubleToBits(-1.0) == Val) || (DoubleToBits(0.5) == Val) || (DoubleToBits(-0.5) == Val) || (DoubleToBits(2.0) == Val) || (DoubleToBits(-2.0) == Val) || (DoubleToBits(4.0) == Val) || (DoubleToBits(-4.0) == Val); } // The actual type of the operand does not seem to matter as long // as the bits match one of the inline immediate values. For example: // // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, // so it is a legal inline immediate. // // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in // floating-point, so it is a legal inline immediate. uint32_t Val = Imm.getZExtValue(); return (FloatToBits(0.0f) == Val) || (FloatToBits(1.0f) == Val) || (FloatToBits(-1.0f) == Val) || (FloatToBits(0.5f) == Val) || (FloatToBits(-0.5f) == Val) || (FloatToBits(2.0f) == Val) || (FloatToBits(-2.0f) == Val) || (FloatToBits(4.0f) == Val) || (FloatToBits(-4.0f) == Val); } bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, unsigned OpSize) const { if (MO.isImm()) { // MachineOperand provides no way to tell the true operand size, since it // only records a 64-bit value. We need to know the size to determine if a // 32-bit floating point immediate bit pattern is legal for an integer // immediate. 
It would be for any 32-bit integer operand, but would not be // for a 64-bit one. unsigned BitSize = 8 * OpSize; return isInlineConstant(APInt(BitSize, MO.getImm(), true)); } return false; } bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const { return MO.isImm() && !isInlineConstant(MO, OpSize); } static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1) { if (Op0.getType() != Op1.getType()) return false; switch (Op0.getType()) { case MachineOperand::MO_Register: return Op0.getReg() == Op1.getReg(); case MachineOperand::MO_Immediate: return Op0.getImm() == Op1.getImm(); default: llvm_unreachable("Didn't expect to be comparing these operand types"); } } bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const { const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) return true; if (OpInfo.RegClass < 0) return false; unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); if (isLiteralConstant(MO, OpSize)) return RI.opCanUseLiteralConstant(OpInfo.OperandType); return RI.opCanUseInlineConstant(OpInfo.OperandType); } bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { int Op32 = AMDGPU::getVOPe32(Opcode); if (Op32 == -1) return false; return pseudoToMCOpcode(Op32) != -1; } bool SIInstrInfo::hasModifiers(unsigned Opcode) const { // The src0_modifier operand is present on all instructions // that have modifiers. return AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0_modifiers) != -1; } bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, unsigned OpName) const { const MachineOperand *Mods = getNamedOperand(MI, OpName); return Mods && Mods->getImm(); } bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, unsigned OpSize) const { // Literal constants use the constant bus. if (isLiteralConstant(MO, OpSize)) return true; if (!MO.isReg() || !MO.isUse()) return false; if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); // FLAT_SCR is just an SGPR pair. if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) return true; // EXEC register uses the constant bus. if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) return true; // SGPRs use the constant bus return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || (!MO.isImplicit() && (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); } static unsigned findImplicitSGPRRead(const MachineInstr &MI) { for (const MachineOperand &MO : MI.implicit_operands()) { // We only care about reads. 
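isInlineConstant above accepts the integers -16..64 plus the bit patterns of 0.0, +/-0.5, +/-1.0, +/-2.0 and +/-4.0. A standalone checker for the 32-bit case only (the 64-bit case is analogous with double bit patterns); it illustrates the rule rather than replacing the method:

#include <cstdint>
#include <cstring>

// Does a sign-extended 32-bit immediate qualify as an SI inline constant?
bool isInlineImm32(int64_t SVal) {
  if (SVal >= -16 && SVal <= 64)
    return true;                        // also covers the 0.0 bit pattern
  const float Table[] = {0.5f, -0.5f, 1.0f, -1.0f, 2.0f, -2.0f, 4.0f, -4.0f};
  const uint32_t Bits = uint32_t(uint64_t(SVal));
  for (float F : Table) {
    uint32_t FBits;
    std::memcpy(&FBits, &F, sizeof(FBits));
    if (FBits == Bits)
      return true;
  }
  return false;
}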
if (MO.isDef()) continue; switch (MO.getReg()) { case AMDGPU::VCC: case AMDGPU::M0: case AMDGPU::FLAT_SCR: return MO.getReg(); default: break; } } return AMDGPU::NoRegister; } static bool shouldReadExec(const MachineInstr &MI) { if (SIInstrInfo::isVALU(MI)) { switch (MI.getOpcode()) { case AMDGPU::V_READLANE_B32: case AMDGPU::V_READLANE_B32_si: case AMDGPU::V_READLANE_B32_vi: case AMDGPU::V_WRITELANE_B32: case AMDGPU::V_WRITELANE_B32_si: case AMDGPU::V_WRITELANE_B32_vi: return false; } return true; } if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || SIInstrInfo::isSALU(MI) || SIInstrInfo::isSMRD(MI)) return false; return true; } bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { uint16_t Opcode = MI.getOpcode(); const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); // Make sure the number of operands is correct. const MCInstrDesc &Desc = get(Opcode); if (!Desc.isVariadic() && Desc.getNumOperands() != MI.getNumExplicitOperands()) { ErrInfo = "Instruction has wrong number of operands."; return false; } // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { if (MI.getOperand(i).isFPImm()) { ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " "all fp values to integers."; return false; } int RegClass = Desc.OpInfo[i].RegClass; switch (Desc.OpInfo[i].OperandType) { case MCOI::OPERAND_REGISTER: if (MI.getOperand(i).isImm()) { ErrInfo = "Illegal immediate value for operand."; return false; } break; case AMDGPU::OPERAND_REG_IMM32: break; case AMDGPU::OPERAND_REG_INLINE_C: if (isLiteralConstant(MI.getOperand(i), RI.getRegClass(RegClass)->getSize())) { ErrInfo = "Illegal immediate value for operand."; return false; } break; case MCOI::OPERAND_IMMEDIATE: case AMDGPU::OPERAND_KIMM32: // Check if this operand is an immediate. // FrameIndex operands will be replaced by immediates, so they are // allowed. if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { ErrInfo = "Expected immediate, but got non-immediate"; return false; } // Fall-through default: continue; } if (!MI.getOperand(i).isReg()) continue; if (RegClass != -1) { unsigned Reg = MI.getOperand(i).getReg(); if (Reg == AMDGPU::NoRegister || TargetRegisterInfo::isVirtualRegister(Reg)) continue; const TargetRegisterClass *RC = RI.getRegClass(RegClass); if (!RC->contains(Reg)) { ErrInfo = "Operand has incorrect register class."; return false; } } } // Verify VOP* if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. 
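The verification that follows enforces the single constant-bus read allowed per VOP instruction: every literal counts, and every distinct SGPR counts once even if several operands read it. Reduced to its essentials, with operand categorization assumed to have been done already:

#include <cstdint>
#include <vector>

struct SrcUse { bool IsLiteral; bool IsSGPR; uint32_t Reg; };

// Count constant-bus reads the way the verifier does and accept at most one.
bool constantBusOk(const std::vector<SrcUse> &Srcs) {
  unsigned Count = 0;
  uint32_t SGPRUsed = ~0u;            // sentinel: no SGPR seen yet
  for (const SrcUse &S : Srcs) {
    if (S.IsLiteral) {
      ++Count;                        // every literal needs the bus
    } else if (S.IsSGPR) {
      if (S.Reg != SGPRUsed)          // rereading the same SGPR is free
        ++Count;
      SGPRUsed = S.Reg;
    }
  }
  return Count <= 1;
}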
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; unsigned ConstantBusCount = 0; if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) ++ConstantBusCount; unsigned SGPRUsed = findImplicitSGPRRead(MI); if (SGPRUsed != AMDGPU::NoRegister) ++ConstantBusCount; for (int OpIdx : OpIndices) { if (OpIdx == -1) break; const MachineOperand &MO = MI.getOperand(OpIdx); if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { if (MO.isReg()) { if (MO.getReg() != SGPRUsed) ++ConstantBusCount; SGPRUsed = MO.getReg(); } else { ++ConstantBusCount; } } } if (ConstantBusCount > 1) { ErrInfo = "VOP* instruction uses the constant bus more than once"; return false; } } // Verify misc. restrictions on specific instructions. if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { const MachineOperand &Src0 = MI.getOperand(Src0Idx); const MachineOperand &Src1 = MI.getOperand(Src1Idx); const MachineOperand &Src2 = MI.getOperand(Src2Idx); if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { if (!compareMachineOp(Src0, Src1) && !compareMachineOp(Src0, Src2)) { ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; return false; } } } // Make sure we aren't losing exec uses in the td files. This mostly requires // being careful when using let Uses to try to add other use registers. if (shouldReadExec(MI)) { if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { ErrInfo = "VALU instruction does not implicitly read exec mask"; return false; } } return true; } unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return AMDGPU::INSTRUCTION_LIST_END; case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; case AMDGPU::COPY: return AMDGPU::COPY; case AMDGPU::PHI: return AMDGPU::PHI; case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::S_MOV_B32: return MI.getOperand(1).isReg() ? 
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; case AMDGPU::S_ADD_I32: case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; case AMDGPU::S_SUB_I32: case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; } } bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; } const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, unsigned OpNo) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); const MCInstrDesc &Desc = get(MI.getOpcode()); if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || Desc.OpInfo[OpNo].RegClass == -1) { unsigned Reg = MI.getOperand(OpNo).getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) return MRI.getRegClass(Reg); return RI.getPhysRegClass(Reg); } unsigned RCID = Desc.OpInfo[OpNo].RegClass; return RI.getRegClass(RCID); } bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { switch (MI.getOpcode()) { case AMDGPU::COPY: case AMDGPU::REG_SEQUENCE: case AMDGPU::PHI: case AMDGPU::INSERT_SUBREG: return RI.hasVGPRs(getOpRegClass(MI, 0)); default: return RI.hasVGPRs(getOpRegClass(MI, OpNo)); } } void 
SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MachineBasicBlock::iterator I = MI; MachineBasicBlock *MBB = MI.getParent(); MachineOperand &MO = MI.getOperand(OpIdx); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (MO.isReg()) Opcode = AMDGPU::COPY; else if (RI.isSGPRClass(RC)) Opcode = AMDGPU::S_MOV_B32; const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) VRC = &AMDGPU::VReg_64RegClass; else VRC = &AMDGPU::VGPR_32RegClass; unsigned Reg = MRI.createVirtualRegister(VRC); DebugLoc DL = MBB->findDebugLoc(I); BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO); MO.ChangeToRegister(Reg, false); } unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const { MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); unsigned SubReg = MRI.createVirtualRegister(SubRC); if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) .addReg(SuperReg.getReg(), 0, SubIdx); return SubReg; } // Just in case the super register is itself a sub-register, copy it to a new // value so we don't need to worry about merging its subreg index with the // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) .addReg(NewSuperReg, 0, SubIdx); return SubReg; } MachineOperand SIInstrInfo::buildExtractSubRegOrImm( MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI, MachineOperand &Op, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const { if (Op.isImm()) { // XXX - Is there a better way to do this? if (SubIdx == AMDGPU::sub0) return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); if (SubIdx == AMDGPU::sub1) return MachineOperand::CreateImm(Op.getImm() >> 32); llvm_unreachable("Unhandled register index for immediate"); } unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, SubIdx, SubRC); return MachineOperand::CreateReg(SubReg, false); } // Change the order of operands from (0, 1, 2) to (0, 2, 1) void SIInstrInfo::swapOperands(MachineInstr &Inst) const { assert(Inst.getNumExplicitOperands() == 3); MachineOperand Op1 = Inst.getOperand(1); Inst.RemoveOperand(1); Inst.addOperand(Op1); } bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const { if (!MO.isReg()) return false; unsigned Reg = MO.getReg(); const TargetRegisterClass *RC = TargetRegisterInfo::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) : RI.getPhysRegClass(Reg); const SIRegisterInfo *TRI = static_cast(MRI.getTargetRegisterInfo()); RC = TRI->getSubRegClass(RC, MO.getSubReg()); // In order to be legal, the common sub-class must be equal to the // class of the current operand. 
For example: // // v_mov_b32 s0 ; Operand defined as vsrc_32 // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL // // s_sendmsg 0, s0 ; Operand defined as m0reg // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; } bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const { if (MO.isReg()) return isLegalRegOperand(MRI, OpInfo, MO); // Handle non-register types that are treated like immediates. assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); return true; } bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; const TargetRegisterClass *DefinedRC = OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; if (!MO) MO = &MI.getOperand(OpIdx); if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { RegSubRegPair SGPRUsed; if (MO->isReg()) SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (i == OpIdx) continue; const MachineOperand &Op = MI.getOperand(i); if (Op.isReg()) { if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && usesConstantBus(MRI, Op, getOpSize(MI, i))) { return false; } } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { return false; } } } if (MO->isReg()) { assert(DefinedRC); return isLegalRegOperand(MRI, OpInfo, *MO); } // Handle non-register types that are treated like immediates. assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); if (!DefinedRC) { // This operand expects an immediate. return true; } return isImmOperandLegal(MI, OpIdx, *MO); } void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); const MCInstrDesc &InstrDesc = get(Opc); int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); MachineOperand &Src1 = MI.getOperand(Src1Idx); // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 // we need to only have one constant bus use. // // Note we do not need to worry about literal constants here. They are // disabled for the operand type for instructions because they will always // violate the one constant bus use rule. bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; if (HasImplicitSGPR) { int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) legalizeOpWithMove(MI, Src0Idx); } // VOP2 src0 instructions support all operand types, so we don't need to check // their legality. If src1 is already legal, we don't need to do anything. if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) return; // We do not use commuteInstruction here because it is too aggressive and will // commute if it is possible. We only want to commute here if it improves // legality. This can be called a fairly large number of times so don't waste // compile time pointlessly swapping and checking legality again. 
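// Roughly, the fallback implemented below: insert a move straight away when commuting
// cannot help (an implicit SGPR use or a non-commutable opcode), when src1 is neither
// an immediate nor a register, when src0 would not be legal in the src1 slot, or when
// no commuted opcode exists; otherwise swap src0 and src1 by hand.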
if (HasImplicitSGPR || !MI.isCommutable()) { legalizeOpWithMove(MI, Src1Idx); return; } int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); MachineOperand &Src0 = MI.getOperand(Src0Idx); // If src0 can be used as src1, commuting will make the operands legal. // Otherwise we have to give up and insert a move. // // TODO: Other immediate-like operand kinds could be commuted if there was a // MachineOperand::ChangeTo* for them. if ((!Src1.isImm() && !Src1.isReg()) || !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { legalizeOpWithMove(MI, Src1Idx); return; } int CommutedOpc = commuteOpcode(MI); if (CommutedOpc == -1) { legalizeOpWithMove(MI, Src1Idx); return; } MI.setDesc(get(CommutedOpc)); unsigned Src0Reg = Src0.getReg(); unsigned Src0SubReg = Src0.getSubReg(); bool Src0Kill = Src0.isKill(); if (Src1.isImm()) Src0.ChangeToImmediate(Src1.getImm()); else if (Src1.isReg()) { Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); Src0.setSubReg(Src1.getSubReg()); } else llvm_unreachable("Should only have register or immediate operands"); Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); Src1.setSubReg(Src0SubReg); } // Legalize VOP3 operands. Because all operand types are supported for any // operand, and since literal constants are not allowed and should never be // seen, we only need to worry about inserting copies if we use multiple SGPR // operands. void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); int VOP3Idx[3] = { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) }; // Find the one SGPR operand we are allowed to use. unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); for (unsigned i = 0; i < 3; ++i) { int Idx = VOP3Idx[i]; if (Idx == -1) break; MachineOperand &MO = MI.getOperand(Idx); // We should never see a VOP3 instruction with an illegal immediate operand. if (!MO.isReg()) continue; if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) continue; // VGPRs are legal if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { SGPRReg = MO.getReg(); // We can use one SGPR in each VOP3 instruction. continue; } // If we make it this far, then the operand is not legal and we must // legalize it. legalizeOpWithMove(MI, Idx); } } unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const { const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); unsigned DstReg = MRI.createVirtualRegister(SRC); unsigned SubRegs = VRC->getSize() / 4; SmallVector SRegs; for (unsigned i = 0; i < SubRegs; ++i) { unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(AMDGPU::V_READFIRSTLANE_B32), SGPR) .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); SRegs.push_back(SGPR); } MachineInstrBuilder MIB = BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), DstReg); for (unsigned i = 0; i < SubRegs; ++i) { MIB.addReg(SRegs[i]); MIB.addImm(RI.getSubRegFromChannel(i)); } return DstReg; } void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const { // If the pointer is store in VGPRs, then we need to move them to // SGPRs using v_readfirstlane. 
This is safe because we only select // loads with uniform pointers to SMRD instructions so we know the // pointer value is uniform. MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); SBase->setReg(SGPR); } } void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { - MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + MachineFunction &MF = *MI.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); // Legalize VOP2 if (isVOP2(MI) || isVOPC(MI)) { legalizeOperandsVOP2(MRI, MI); return; } // Legalize VOP3 if (isVOP3(MI)) { legalizeOperandsVOP3(MRI, MI); return; } // Legalize SMRD if (isSMRD(MI)) { legalizeOperandsSMRD(MRI, MI); return; } // Legalize REG_SEQUENCE and PHI // The register class of the operands must be the same type as the register // class of the output. if (MI.getOpcode() == AMDGPU::PHI) { const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { if (!MI.getOperand(i).isReg() || !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) continue; const TargetRegisterClass *OpRC = MRI.getRegClass(MI.getOperand(i).getReg()); if (RI.hasVGPRs(OpRC)) { VRC = OpRC; } else { SRC = OpRC; } } // If any of the operands are VGPR registers, then they all must be VGPRs, // otherwise we will create illegal VGPR->SGPR copies when legalizing // them. if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { if (!VRC) { assert(SRC); VRC = RI.getEquivalentVGPRClass(SRC); } RC = VRC; } else { RC = SRC; } // Update all the operands so they have the same type. for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; unsigned DstReg = MRI.createVirtualRegister(RC); // MI is a PHI instruction. MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) .addOperand(Op); Op.setReg(DstReg); } } // REG_SEQUENCE doesn't really require operand legalization, but if one has a // VGPR dest type and SGPR sources, insert copies so all operands are // VGPRs. This seems to help operand folding / the register coalescer. if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { MachineBasicBlock *MBB = MI.getParent(); const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); if (RI.hasVGPRs(DstRC)) { // Update all the operands so they are VGPR register classes. These may // not be the same register class because REG_SEQUENCE supports mixing // subregister index types e.g.
sub0_sub1 + sub2 + sub3 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); if (VRC == OpRC) continue; unsigned DstReg = MRI.createVirtualRegister(VRC); BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) .addOperand(Op); Op.setReg(DstReg); Op.setIsKill(); } } return; } // Legalize INSERT_SUBREG // src0 must have the same register class as dst if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { unsigned Dst = MI.getOperand(0).getReg(); unsigned Src0 = MI.getOperand(1).getReg(); const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); if (DstRC != Src0RC) { MachineBasicBlock &MBB = *MI.getParent(); unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0) .addReg(Src0); MI.getOperand(1).setReg(NewSrc0); } return; } - // Legalize MIMG - if (isMIMG(MI)) { + // Legalize MIMG and MUBUF/MTBUF for shaders. + // + // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via + // scratch memory access. In both cases, the legalization never involves + // conversion to the addr64 form. + if (isMIMG(MI) || + (AMDGPU::isShader(MF.getFunction()->getCallingConv()) && + (isMUBUF(MI) || isMTBUF(MI)))) { MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) { unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI); SRsrc->setReg(SGPR); } MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) { unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI); SSamp->setReg(SGPR); } return; } - // Legalize MUBUF* instructions + // Legalize MUBUF* instructions by converting to addr64 form. // FIXME: If we start using the non-addr64 instructions for compute, we - // may need to legalize them here. + // may need to legalize them as above. This especially applies to the + // buffer_load_format_* variants and variants with idxen (or bothen). int SRsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); if (SRsrcIdx != -1) { // We have an MUBUF instruction MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx); unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass; if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), RI.getRegClass(SRsrcRC))) { // The operands are legal. // FIXME: We may need to legalize operands besided srsrc. return; } MachineBasicBlock &MBB = *MI.getParent(); // Extract the ptr from the resource descriptor. 
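// Sketch of the replacement descriptor built below: sub0_sub1 = 0 (the 64-bit base
// pointer is instead carried in vaddr for the addr64 form), sub2 = RSRC_DATA_FORMAT[31:0],
// sub3 = RSRC_DATA_FORMAT[63:32] from getDefaultRsrcDataFormat().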
unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); // Create an empty resource descriptor unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); // Zero64 = 0 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64) .addImm(0); // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo) .addImm(RsrcDataFormat & 0xFFFFFFFF); // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi) .addImm(RsrcDataFormat >> 32); // NewSRsrc = {Zero64, SRsrcFormat} BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) .addReg(Zero64) .addImm(AMDGPU::sub0_sub1) .addReg(SRsrcFormatLo) .addImm(AMDGPU::sub2) .addReg(SRsrcFormatHi) .addImm(AMDGPU::sub3); MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); if (VAddr) { // This is already an ADDR64 instruction so we need to add the pointer // extracted from the resource descriptor to the current value of VAddr. unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 DebugLoc DL = MI.getDebugLoc(); BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) .addReg(SRsrcPtr, 0, AMDGPU::sub0) .addReg(VAddr->getReg(), 0, AMDGPU::sub0); // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) .addReg(SRsrcPtr, 0, AMDGPU::sub1) .addReg(VAddr->getReg(), 0, AMDGPU::sub1); // NewVaddr = {NewVaddrHi, NewVaddrLo} BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) .addReg(NewVAddrLo) .addImm(AMDGPU::sub0) .addReg(NewVAddrHi) .addImm(AMDGPU::sub1); } else { // This instruction is the _OFFSET variant, so we need to convert it to // ADDR64. assert(MBB.getParent()->getSubtarget().getGeneration() < SISubtarget::VOLCANIC_ISLANDS && "FIXME: Need to emit flat atomics here"); MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); // Atomics with return have an additional tied operand and are // missing some of the special bits. MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); MachineInstr *Addr64; if (!VDataIn) { // Regular buffer load / store. MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) .addOperand(*VData) .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. // This will be replaced later // with the new value of vaddr. .addOperand(*SRsrc) .addOperand(*SOffset) .addOperand(*Offset); // Atomics do not have this operand.
if (const MachineOperand *GLC = getNamedOperand(MI, AMDGPU::OpName::glc)) { MIB.addImm(GLC->getImm()); } MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); if (const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe)) { MIB.addImm(TFE->getImm()); } MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); Addr64 = MIB; } else { // Atomics with return. Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) .addOperand(*VData) .addOperand(*VDataIn) .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. // This will be replaced later // with the new value of vaddr. .addOperand(*SRsrc) .addOperand(*SOffset) .addOperand(*Offset) .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); } MI.removeFromParent(); // NewVaddr = {NewVaddrHi, NewVaddrLo} BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) .addReg(SRsrcPtr, 0, AMDGPU::sub0) .addImm(AMDGPU::sub0) .addReg(SRsrcPtr, 0, AMDGPU::sub1) .addImm(AMDGPU::sub1); VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr); SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc); } // Update the instruction to use NewVaddr VAddr->setReg(NewVAddr); // Update the instruction to use NewSRsrc SRsrc->setReg(NewSRsrc); } } void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { SmallVector Worklist; Worklist.push_back(&TopInst); while (!Worklist.empty()) { MachineInstr &Inst = *Worklist.pop_back_val(); MachineBasicBlock *MBB = Inst.getParent(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned Opcode = Inst.getOpcode(); unsigned NewOpcode = getVALUOp(Inst); // Handle some special cases switch (Opcode) { default: break; case AMDGPU::S_AND_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); Inst.eraseFromParent(); continue; case AMDGPU::S_OR_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); Inst.eraseFromParent(); continue; case AMDGPU::S_XOR_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); Inst.eraseFromParent(); continue; case AMDGPU::S_NOT_B64: splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); Inst.eraseFromParent(); continue; case AMDGPU::S_BCNT1_I32_B64: splitScalar64BitBCNT(Worklist, Inst); Inst.eraseFromParent(); continue; case AMDGPU::S_BFE_I64: { splitScalar64BitBFE(Worklist, Inst); Inst.eraseFromParent(); continue; } case AMDGPU::S_LSHL_B32: if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I32: if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B32: if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHL_B64: if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I64: if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B64: if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B64; swapOperands(Inst); } break; case AMDGPU::S_ABS_I32: lowerScalarAbs(Worklist, Inst); Inst.eraseFromParent(); continue; case AMDGPU::S_CBRANCH_SCC0: case AMDGPU::S_CBRANCH_SCC1: // Clear unused bits of vcc BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 
AMDGPU::VCC) .addReg(AMDGPU::EXEC) .addReg(AMDGPU::VCC); break; case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to // legalize its operands instead. legalizeOperands(Inst); continue; } // Use the new VALU Opcode. const MCInstrDesc &NewDesc = get(NewOpcode); Inst.setDesc(NewDesc); // Remove any references to SCC. Vector instructions can't read from it, and // We're just about to add the implicit use / defs of VCC, and we don't want // both. for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { MachineOperand &Op = Inst.getOperand(i); if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { Inst.RemoveOperand(i); addSCCDefUsersToVALUWorklist(Inst, Worklist); } } if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { // We are converting these to a BFE, so we need to add the missing // operands for the size and offset. unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; Inst.addOperand(MachineOperand::CreateImm(0)); Inst.addOperand(MachineOperand::CreateImm(Size)); } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { // The VALU version adds the second operand to the result, so insert an // extra 0 operand. Inst.addOperand(MachineOperand::CreateImm(0)); } Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { const MachineOperand &OffsetWidthOp = Inst.getOperand(2); // If we need to move this to VGPRs, we need to unpack the second operand // back into the 2 separate ones for bit offset and width. assert(OffsetWidthOp.isImm() && "Scalar BFE is only implemented for constant width and offset"); uint32_t Imm = OffsetWidthOp.getImm(); uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. Inst.RemoveOperand(2); // Remove old immediate. Inst.addOperand(MachineOperand::CreateImm(Offset)); Inst.addOperand(MachineOperand::CreateImm(BitWidth)); } bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); unsigned NewDstReg = AMDGPU::NoRegister; if (HasDst) { // Update the destination register class. 
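// The SGPR destination is replaced with a fresh virtual register of the equivalent
// VGPR class; MRI.replaceRegWith() rewrites every use, and any user that still cannot
// read a VGPR is queued on the worklist after the operands are legalized.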
const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); if (!NewDstRC) continue; unsigned DstReg = Inst.getOperand(0).getReg(); NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); } // Legalize the operands legalizeOperands(Inst); if (HasDst) addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } } void SIInstrInfo::lowerScalarAbs(SmallVectorImpl &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; DebugLoc DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src = Inst.getOperand(1); unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) .addImm(0) .addReg(Src.getReg()); BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) .addReg(Src.getReg()) .addReg(TmpReg); MRI.replaceRegWith(Dest.getReg(), ResultReg); addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitUnaryOp( SmallVectorImpl &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = Inst.getOperand(1); DebugLoc DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; const MCInstrDesc &InstDesc = get(Opcode); const TargetRegisterClass *Src0RC = Src0.isReg() ? MRI.getRegClass(Src0.getReg()) : &AMDGPU::SGPR_32RegClass; const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); BuildMI(MBB, MII, DL, InstDesc, DestSub0) .addOperand(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); BuildMI(MBB, MII, DL, InstDesc, DestSub1) .addOperand(SrcReg0Sub1); unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) .addImm(AMDGPU::sub0) .addReg(DestSub1) .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), FullDestReg); // We don't need to legalizeOperands here because for a single operand, src0 // will support any kind of input. // Move all users of this moved value. addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBinaryOp( SmallVectorImpl &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); DebugLoc DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; const MCInstrDesc &InstDesc = get(Opcode); const TargetRegisterClass *Src0RC = Src0.isReg() ? 
MRI.getRegClass(Src0.getReg()) : &AMDGPU::SGPR_32RegClass; const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); const TargetRegisterClass *Src1RC = Src1.isReg() ? MRI.getRegClass(Src1.getReg()) : &AMDGPU::SGPR_32RegClass; const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) .addOperand(SrcReg0Sub0) .addOperand(SrcReg1Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) .addOperand(SrcReg0Sub1) .addOperand(SrcReg1Sub1); unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) .addImm(AMDGPU::sub0) .addReg(DestSub1) .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), FullDestReg); // Try to legalize the operands in case we need to swap the order to keep it // valid. legalizeOperands(LoHalf); legalizeOperands(HiHalf); // Move all users of this moved value. addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBCNT( SmallVectorImpl &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; DebugLoc DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src = Inst.getOperand(1); const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); const TargetRegisterClass *SrcRC = Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass; unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC); MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC); BuildMI(MBB, MII, DL, InstDesc, MidReg) .addOperand(SrcRegSub0) .addImm(0); BuildMI(MBB, MII, DL, InstDesc, ResultReg) .addOperand(SrcRegSub1) .addReg(MidReg); MRI.replaceRegWith(Dest.getReg(), ResultReg); // We don't need to legalize operands here. src0 for either instruction can be // an SGPR, and the second input is unused or determined here.
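// (Put differently, the 64-bit count above is formed as bcnt(hi, bcnt(lo, 0)): each
// V_BCNT_U32_B32 adds its second operand to the population count of its first, so the
// two halves accumulate into ResultReg.)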
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; DebugLoc DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); uint32_t Imm = Inst.getOperand(2).getImm(); uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. (void) Offset; // Only sext_inreg cases handled. assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && Offset == 0 && "Not implemented"); if (BitWidth < 32) { unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) .addImm(0) .addImm(BitWidth); BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) .addImm(31) .addReg(MidRegLo); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) .addReg(MidRegLo) .addImm(AMDGPU::sub0) .addReg(MidRegHi) .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); return; } MachineOperand &Src = Inst.getOperand(1); unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) .addImm(31) .addReg(Src.getReg(), 0, AMDGPU::sub0); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) .addReg(Src.getReg(), 0, AMDGPU::sub0) .addImm(AMDGPU::sub0) .addReg(TmpReg) .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } void SIInstrInfo::addUsersToMoveToVALUWorklist( unsigned DstReg, MachineRegisterInfo &MRI, SmallVectorImpl &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), E = MRI.use_end(); I != E; ++I) { MachineInstr &UseMI = *I->getParent(); if (!canReadVGPR(UseMI, I.getOperandNo())) { Worklist.push_back(&UseMI); } } } void SIInstrInfo::addSCCDefUsersToVALUWorklist( MachineInstr &SCCDefInst, SmallVectorImpl &Worklist) const { // This assumes that all the users of SCC are in the same block // as the SCC def. for (MachineInstr &MI : llvm::make_range(MachineBasicBlock::iterator(SCCDefInst), SCCDefInst.getParent()->end())) { // Exit if we find another SCC def. if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) return; if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) Worklist.push_back(&MI); } } const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( const MachineInstr &Inst) const { const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); switch (Inst.getOpcode()) { // For target instructions, getOpRegClass just returns the virtual register // class associated with the operand, so we need to find an equivalent VGPR // register class in order to move the instruction to the VALU. case AMDGPU::COPY: case AMDGPU::PHI: case AMDGPU::REG_SEQUENCE: case AMDGPU::INSERT_SUBREG: if (RI.hasVGPRs(NewDstRC)) return nullptr; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); if (!NewDstRC) return nullptr; return NewDstRC; default: return NewDstRC; } } // Find the one SGPR operand we are allowed to use. 
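// (Background, stated loosely: a VALU instruction may read at most one unique SGPR or
// literal through the constant bus, so e.g. v_add_f32 v0, s0, s1 cannot be encoded until
// one scalar source is copied to a VGPR; the V_FMA_F32 examples further down illustrate
// the same restriction with three sources.)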
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const { const MCInstrDesc &Desc = MI.getDesc(); // Find the one SGPR operand we are allowed to use. // // First we need to consider the instruction's operand requirements before // legalizing. Some operands are required to be SGPRs, such as implicit uses // of VCC, but we are still bound by the constant bus requirement to only use // one. // // If the operand's class is an SGPR, we can never move it. unsigned SGPRReg = findImplicitSGPRRead(MI); if (SGPRReg != AMDGPU::NoRegister) return SGPRReg; unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (unsigned i = 0; i < 3; ++i) { int Idx = OpIndices[i]; if (Idx == -1) break; const MachineOperand &MO = MI.getOperand(Idx); if (!MO.isReg()) continue; // Is this operand statically required to be an SGPR based on the operand // constraints? const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); bool IsRequiredSGPR = RI.isSGPRClass(OpRC); if (IsRequiredSGPR) return MO.getReg(); // If this could be a VGPR or an SGPR, Check the dynamic register class. unsigned Reg = MO.getReg(); const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); if (RI.isSGPRClass(RegRC)) UsedSGPRs[i] = Reg; } // We don't have a required SGPR operand, so we have a bit more freedom in // selecting operands to move. // Try to select the most used SGPR. If an SGPR is equal to one of the // others, we choose that. // // e.g. // V_FMA_F32 v0, s0, s0, s0 -> No moves // V_FMA_F32 v0, s0, s1, s0 -> Move s1 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should // prefer those. if (UsedSGPRs[0] != AMDGPU::NoRegister) { if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) SGPRReg = UsedSGPRs[0]; } if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { if (UsedSGPRs[1] == UsedSGPRs[2]) SGPRReg = UsedSGPRs[1]; } return SGPRReg; } MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, unsigned OperandName) const { int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); if (Idx == -1) return nullptr; return &MI.getOperand(Idx); } uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; if (ST.isAmdHsaOS()) { RsrcDataFormat |= (1ULL << 56); if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) // Set MTYPE = 2 RsrcDataFormat |= (2ULL << 59); } return RsrcDataFormat; } uint64_t SIInstrInfo::getScratchRsrcWords23() const { uint64_t Rsrc23 = getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | 0xffffffff; // Size; uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) | // IndexStride = 64 (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT); // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. 
if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; return Rsrc23; } bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); return isSMRD(Opc); } bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); } unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); unsigned DescSize = Desc.getSize(); // If we have a definitive size, we can use it. Otherwise we need to inspect // the operands to know the size. if (DescSize == 8 || DescSize == 4) return DescSize; assert(DescSize == 0); // 4-byte instructions may have a 32-bit literal encoded after them. Check // operands that could ever be literals. if (isVALU(MI) || isSALU(MI)) { int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) return 4; // No operands. if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx))) return 8; int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return 4; if (isLiteralConstant(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx))) return 8; return 4; } switch (Opc) { case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: case TargetOpcode::DBG_VALUE: case TargetOpcode::BUNDLE: case TargetOpcode::EH_LABEL: return 0; case TargetOpcode::INLINEASM: { const MachineFunction *MF = MI.getParent()->getParent(); const char *AsmStr = MI.getOperand(0).getSymbolName(); return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); } default: llvm_unreachable("unable to find instruction size"); } } ArrayRef> SIInstrInfo::getSerializableTargetIndices() const { static const std::pair TargetIndices[] = { {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; return makeArrayRef(TargetIndices); } /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The /// post-RA version of misched uses CreateTargetMIHazardRecognizer. ScheduleHazardRecognizer * SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const { return new GCNHazardRecognizer(DAG->MF); } /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer /// pass. ScheduleHazardRecognizer * SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { return new GCNHazardRecognizer(MF); } Index: projects/clang391-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- projects/clang391-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td (revision 309436) +++ projects/clang391-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td (revision 309437) @@ -1,3561 +1,3562 @@ //===-- SIInstructions.td - SI Instruction Definitions --------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details.
// //===----------------------------------------------------------------------===// // This file was originally auto-generated from a GPU register header file and // all the instruction definitions were originally commented out. Instructions // that are not yet supported remain commented out. //===----------------------------------------------------------------------===// class InterpSlots { int P0 = 2; int P10 = 0; int P20 = 1; } def INTERP : InterpSlots; def isGCN : Predicate<"Subtarget->getGeneration() " ">= SISubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureGCN">; def isSI : Predicate<"Subtarget->getGeneration() " "== SISubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureSouthernIslands">; def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; let SubtargetPredicate = isGCN in { //===----------------------------------------------------------------------===// // EXP Instructions //===----------------------------------------------------------------------===// defm EXP : EXP_m; //===----------------------------------------------------------------------===// // SMRD Instructions //===----------------------------------------------------------------------===// // We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit // SMRD instructions, because the SReg_32_XM0 register class does not include M0 // and writing to M0 from an SMRD instruction will hang the GPU. defm S_LOAD_DWORD : SMRD_Helper , "s_load_dword", SReg_64, SReg_32_XM0>; defm S_LOAD_DWORDX2 : SMRD_Helper , "s_load_dwordx2", SReg_64, SReg_64>; defm S_LOAD_DWORDX4 : SMRD_Helper , "s_load_dwordx4", SReg_64, SReg_128>; defm S_LOAD_DWORDX8 : SMRD_Helper , "s_load_dwordx8", SReg_64, SReg_256>; defm S_LOAD_DWORDX16 : SMRD_Helper , "s_load_dwordx16", SReg_64, SReg_512>; defm S_BUFFER_LOAD_DWORD : SMRD_Helper < smrd<0x08>, "s_buffer_load_dword", SReg_128, SReg_32_XM0 >; defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < smrd<0x09>, "s_buffer_load_dwordx2", SReg_128, SReg_64 >; defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < smrd<0x0a>, "s_buffer_load_dwordx4", SReg_128, SReg_128 >; defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < smrd<0x0b>, "s_buffer_load_dwordx8", SReg_128, SReg_256 >; defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512 >; let mayStore = ? in { // FIXME: mayStore = ? is a workaround for tablegen bug for different // inferred mayStore flags for the instruction pattern vs. standalone // Pat. Each considers the other contradictory. 
defm S_MEMTIME : SMRD_Special , "s_memtime", (outs SReg_64:$sdst), ?, " $sdst", [(set i64:$sdst, (int_amdgcn_s_memtime))] >; } defm S_DCACHE_INV : SMRD_Inval , "s_dcache_inv", int_amdgcn_s_dcache_inv>; //===----------------------------------------------------------------------===// // SOP1 Instructions //===----------------------------------------------------------------------===// let isMoveImm = 1 in { let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm S_MOV_B32 : SOP1_32 , "s_mov_b32", []>; defm S_MOV_B64 : SOP1_64 , "s_mov_b64", []>; } // End isRematerializeable = 1 let Uses = [SCC] in { defm S_CMOV_B32 : SOP1_32 , "s_cmov_b32", []>; defm S_CMOV_B64 : SOP1_64 , "s_cmov_b64", []>; } // End Uses = [SCC] } // End isMoveImm = 1 let Defs = [SCC] in { defm S_NOT_B32 : SOP1_32 , "s_not_b32", [(set i32:$sdst, (not i32:$src0))] >; defm S_NOT_B64 : SOP1_64 , "s_not_b64", [(set i64:$sdst, (not i64:$src0))] >; defm S_WQM_B32 : SOP1_32 , "s_wqm_b32", []>; defm S_WQM_B64 : SOP1_64 , "s_wqm_b64", []>; } // End Defs = [SCC] defm S_BREV_B32 : SOP1_32 , "s_brev_b32", [(set i32:$sdst, (bitreverse i32:$src0))] >; defm S_BREV_B64 : SOP1_64 , "s_brev_b64", []>; let Defs = [SCC] in { defm S_BCNT0_I32_B32 : SOP1_32 , "s_bcnt0_i32_b32", []>; defm S_BCNT0_I32_B64 : SOP1_32_64 , "s_bcnt0_i32_b64", []>; defm S_BCNT1_I32_B32 : SOP1_32 , "s_bcnt1_i32_b32", [(set i32:$sdst, (ctpop i32:$src0))] >; defm S_BCNT1_I32_B64 : SOP1_32_64 , "s_bcnt1_i32_b64", []>; } // End Defs = [SCC] defm S_FF0_I32_B32 : SOP1_32 , "s_ff0_i32_b32", []>; defm S_FF0_I32_B64 : SOP1_32_64 , "s_ff0_i32_b64", []>; defm S_FF1_I32_B32 : SOP1_32 , "s_ff1_i32_b32", [(set i32:$sdst, (cttz_zero_undef i32:$src0))] >; defm S_FF1_I32_B64 : SOP1_32_64 , "s_ff1_i32_b64", []>; defm S_FLBIT_I32_B32 : SOP1_32 , "s_flbit_i32_b32", [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] >; defm S_FLBIT_I32_B64 : SOP1_32_64 , "s_flbit_i32_b64", []>; defm S_FLBIT_I32 : SOP1_32 , "s_flbit_i32", [(set i32:$sdst, (int_AMDGPU_flbit_i32 i32:$src0))] >; defm S_FLBIT_I32_I64 : SOP1_32_64 , "s_flbit_i32_i64", []>; defm S_SEXT_I32_I8 : SOP1_32 , "s_sext_i32_i8", [(set i32:$sdst, (sext_inreg i32:$src0, i8))] >; defm S_SEXT_I32_I16 : SOP1_32 , "s_sext_i32_i16", [(set i32:$sdst, (sext_inreg i32:$src0, i16))] >; defm S_BITSET0_B32 : SOP1_32 , "s_bitset0_b32", []>; defm S_BITSET0_B64 : SOP1_64_32 , "s_bitset0_b64", []>; defm S_BITSET1_B32 : SOP1_32 , "s_bitset1_b32", []>; defm S_BITSET1_B64 : SOP1_64_32 , "s_bitset1_b64", []>; defm S_GETPC_B64 : SOP1_64_0 , "s_getpc_b64", []>; defm S_SETPC_B64 : SOP1_1 , "s_setpc_b64", []>; defm S_SWAPPC_B64 : SOP1_64 , "s_swappc_b64", []>; defm S_RFE_B64 : SOP1_1 , "s_rfe_b64", []>; let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { defm S_AND_SAVEEXEC_B64 : SOP1_64 , "s_and_saveexec_b64", []>; defm S_OR_SAVEEXEC_B64 : SOP1_64 , "s_or_saveexec_b64", []>; defm S_XOR_SAVEEXEC_B64 : SOP1_64 , "s_xor_saveexec_b64", []>; defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 , "s_andn2_saveexec_b64", []>; defm S_ORN2_SAVEEXEC_B64 : SOP1_64 , "s_orn2_saveexec_b64", []>; defm S_NAND_SAVEEXEC_B64 : SOP1_64 , "s_nand_saveexec_b64", []>; defm S_NOR_SAVEEXEC_B64 : SOP1_64 , "s_nor_saveexec_b64", []>; defm S_XNOR_SAVEEXEC_B64 : SOP1_64 , "s_xnor_saveexec_b64", []>; } // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] defm S_QUADMASK_B32 : SOP1_32 , "s_quadmask_b32", []>; defm S_QUADMASK_B64 : SOP1_64 , "s_quadmask_b64", []>; let Uses = [M0] in { defm S_MOVRELS_B32 : SOP1_32 , "s_movrels_b32", []>; defm S_MOVRELS_B64 : SOP1_64 , "s_movrels_b64", []>; defm 
S_MOVRELD_B32 : SOP1_32 , "s_movreld_b32", []>; defm S_MOVRELD_B64 : SOP1_64 , "s_movreld_b64", []>; } // End Uses = [M0] defm S_CBRANCH_JOIN : SOP1_1 , "s_cbranch_join", []>; defm S_MOV_REGRD_B32 : SOP1_32 , "s_mov_regrd_b32", []>; let Defs = [SCC] in { defm S_ABS_I32 : SOP1_32 , "s_abs_i32", []>; } // End Defs = [SCC] defm S_MOV_FED_B32 : SOP1_32 , "s_mov_fed_b32", []>; //===----------------------------------------------------------------------===// // SOP2 Instructions //===----------------------------------------------------------------------===// let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { defm S_ADD_U32 : SOP2_32 , "s_add_u32", []>; defm S_ADD_I32 : SOP2_32 , "s_add_i32", [(set i32:$sdst, (add SSrc_32:$src0, SSrc_32:$src1))] >; } // End isCommutable = 1 defm S_SUB_U32 : SOP2_32 , "s_sub_u32", []>; defm S_SUB_I32 : SOP2_32 , "s_sub_i32", [(set i32:$sdst, (sub SSrc_32:$src0, SSrc_32:$src1))] >; let Uses = [SCC] in { // Carry in comes from SCC let isCommutable = 1 in { defm S_ADDC_U32 : SOP2_32 , "s_addc_u32", [(set i32:$sdst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End isCommutable = 1 defm S_SUBB_U32 : SOP2_32 , "s_subb_u32", [(set i32:$sdst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End Uses = [SCC] defm S_MIN_I32 : SOP2_32 , "s_min_i32", [(set i32:$sdst, (smin i32:$src0, i32:$src1))] >; defm S_MIN_U32 : SOP2_32 , "s_min_u32", [(set i32:$sdst, (umin i32:$src0, i32:$src1))] >; defm S_MAX_I32 : SOP2_32 , "s_max_i32", [(set i32:$sdst, (smax i32:$src0, i32:$src1))] >; defm S_MAX_U32 : SOP2_32 , "s_max_u32", [(set i32:$sdst, (umax i32:$src0, i32:$src1))] >; } // End Defs = [SCC] let Uses = [SCC] in { defm S_CSELECT_B32 : SOP2_32 , "s_cselect_b32", []>; defm S_CSELECT_B64 : SOP2_64 , "s_cselect_b64", []>; } // End Uses = [SCC] let Defs = [SCC] in { defm S_AND_B32 : SOP2_32 , "s_and_b32", [(set i32:$sdst, (and i32:$src0, i32:$src1))] >; defm S_AND_B64 : SOP2_64 , "s_and_b64", [(set i64:$sdst, (and i64:$src0, i64:$src1))] >; defm S_OR_B32 : SOP2_32 , "s_or_b32", [(set i32:$sdst, (or i32:$src0, i32:$src1))] >; defm S_OR_B64 : SOP2_64 , "s_or_b64", [(set i64:$sdst, (or i64:$src0, i64:$src1))] >; defm S_XOR_B32 : SOP2_32 , "s_xor_b32", [(set i32:$sdst, (xor i32:$src0, i32:$src1))] >; defm S_XOR_B64 : SOP2_64 , "s_xor_b64", [(set i64:$sdst, (xor i64:$src0, i64:$src1))] >; defm S_ANDN2_B32 : SOP2_32 , "s_andn2_b32", []>; defm S_ANDN2_B64 : SOP2_64 , "s_andn2_b64", []>; defm S_ORN2_B32 : SOP2_32 , "s_orn2_b32", []>; defm S_ORN2_B64 : SOP2_64 , "s_orn2_b64", []>; defm S_NAND_B32 : SOP2_32 , "s_nand_b32", []>; defm S_NAND_B64 : SOP2_64 , "s_nand_b64", []>; defm S_NOR_B32 : SOP2_32 , "s_nor_b32", []>; defm S_NOR_B64 : SOP2_64 , "s_nor_b64", []>; defm S_XNOR_B32 : SOP2_32 , "s_xnor_b32", []>; defm S_XNOR_B64 : SOP2_64 , "s_xnor_b64", []>; } // End Defs = [SCC] // Use added complexity so these patterns are preferred to the VALU patterns. 
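// (AddedComplexity raises the priority of these selection patterns, so when both a
// scalar pattern from this block and a VALU pattern match the same node, the scalar
// form is chosen first.)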
let AddedComplexity = 1 in { let Defs = [SCC] in { defm S_LSHL_B32 : SOP2_32 , "s_lshl_b32", [(set i32:$sdst, (shl i32:$src0, i32:$src1))] >; defm S_LSHL_B64 : SOP2_64_32 , "s_lshl_b64", [(set i64:$sdst, (shl i64:$src0, i32:$src1))] >; defm S_LSHR_B32 : SOP2_32 , "s_lshr_b32", [(set i32:$sdst, (srl i32:$src0, i32:$src1))] >; defm S_LSHR_B64 : SOP2_64_32 , "s_lshr_b64", [(set i64:$sdst, (srl i64:$src0, i32:$src1))] >; defm S_ASHR_I32 : SOP2_32 , "s_ashr_i32", [(set i32:$sdst, (sra i32:$src0, i32:$src1))] >; defm S_ASHR_I64 : SOP2_64_32 , "s_ashr_i64", [(set i64:$sdst, (sra i64:$src0, i32:$src1))] >; } // End Defs = [SCC] defm S_BFM_B32 : SOP2_32 , "s_bfm_b32", [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>; defm S_BFM_B64 : SOP2_64_32_32 , "s_bfm_b64", []>; defm S_MUL_I32 : SOP2_32 , "s_mul_i32", [(set i32:$sdst, (mul i32:$src0, i32:$src1))] >; } // End AddedComplexity = 1 let Defs = [SCC] in { defm S_BFE_U32 : SOP2_32 , "s_bfe_u32", []>; defm S_BFE_I32 : SOP2_32 , "s_bfe_i32", []>; defm S_BFE_U64 : SOP2_64_32 , "s_bfe_u64", []>; defm S_BFE_I64 : SOP2_64_32 , "s_bfe_i64", []>; } // End Defs = [SCC] let sdst = 0 in { defm S_CBRANCH_G_FORK : SOP2_m < sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs), (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", [] >; } let Defs = [SCC] in { defm S_ABSDIFF_I32 : SOP2_32 , "s_absdiff_i32", []>; } // End Defs = [SCC] //===----------------------------------------------------------------------===// // SOPC Instructions //===----------------------------------------------------------------------===// def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00000000, "s_cmp_eq_i32", COND_EQ>; def S_CMP_LG_I32 : SOPC_CMP_32 <0x00000001, "s_cmp_lg_i32", COND_NE>; def S_CMP_GT_I32 : SOPC_CMP_32 <0x00000002, "s_cmp_gt_i32", COND_SGT>; def S_CMP_GE_I32 : SOPC_CMP_32 <0x00000003, "s_cmp_ge_i32", COND_SGE>; def S_CMP_LT_I32 : SOPC_CMP_32 <0x00000004, "s_cmp_lt_i32", COND_SLT>; def S_CMP_LE_I32 : SOPC_CMP_32 <0x00000005, "s_cmp_le_i32", COND_SLE>; def S_CMP_EQ_U32 : SOPC_CMP_32 <0x00000006, "s_cmp_eq_u32", COND_EQ>; def S_CMP_LG_U32 : SOPC_CMP_32 <0x00000007, "s_cmp_lg_u32", COND_NE >; def S_CMP_GT_U32 : SOPC_CMP_32 <0x00000008, "s_cmp_gt_u32", COND_UGT>; def S_CMP_GE_U32 : SOPC_CMP_32 <0x00000009, "s_cmp_ge_u32", COND_UGE>; def S_CMP_LT_U32 : SOPC_CMP_32 <0x0000000a, "s_cmp_lt_u32", COND_ULT>; def S_CMP_LE_U32 : SOPC_CMP_32 <0x0000000b, "s_cmp_le_u32", COND_ULE>; def S_BITCMP0_B32 : SOPC_32 <0x0000000c, "s_bitcmp0_b32">; def S_BITCMP1_B32 : SOPC_32 <0x0000000d, "s_bitcmp1_b32">; def S_BITCMP0_B64 : SOPC_64_32 <0x0000000e, "s_bitcmp0_b64">; def S_BITCMP1_B64 : SOPC_64_32 <0x0000000f, "s_bitcmp1_b64">; def S_SETVSKIP : SOPC_32 <0x00000010, "s_setvskip">; //===----------------------------------------------------------------------===// // SOPK Instructions //===----------------------------------------------------------------------===// let isReMaterializable = 1, isMoveImm = 1 in { defm S_MOVK_I32 : SOPK_32 , "s_movk_i32", []>; } // End isReMaterializable = 1 let Uses = [SCC] in { defm S_CMOVK_I32 : SOPK_32 , "s_cmovk_i32", []>; } let isCompare = 1 in { /* This instruction is disabled for now until we can figure out how to teach the instruction selector to correctly use the S_CMP* vs V_CMP* instructions. 
When this instruction is enabled the code generator sometimes produces this invalid sequence: SCC = S_CMPK_EQ_I32 SGPR0, imm VCC = COPY SCC VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] >; */ defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", []>; defm S_CMPK_LG_I32 : SOPK_SCC , "s_cmpk_lg_i32", []>; defm S_CMPK_GT_I32 : SOPK_SCC , "s_cmpk_gt_i32", []>; defm S_CMPK_GE_I32 : SOPK_SCC , "s_cmpk_ge_i32", []>; defm S_CMPK_LT_I32 : SOPK_SCC , "s_cmpk_lt_i32", []>; defm S_CMPK_LE_I32 : SOPK_SCC , "s_cmpk_le_i32", []>; defm S_CMPK_EQ_U32 : SOPK_SCC , "s_cmpk_eq_u32", []>; defm S_CMPK_LG_U32 : SOPK_SCC , "s_cmpk_lg_u32", []>; defm S_CMPK_GT_U32 : SOPK_SCC , "s_cmpk_gt_u32", []>; defm S_CMPK_GE_U32 : SOPK_SCC , "s_cmpk_ge_u32", []>; defm S_CMPK_LT_U32 : SOPK_SCC , "s_cmpk_lt_u32", []>; defm S_CMPK_LE_U32 : SOPK_SCC , "s_cmpk_le_u32", []>; } // End isCompare = 1 let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", Constraints = "$sdst = $src0" in { defm S_ADDK_I32 : SOPK_32TIE , "s_addk_i32", []>; defm S_MULK_I32 : SOPK_32TIE , "s_mulk_i32", []>; } defm S_CBRANCH_I_FORK : SOPK_m < sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs), (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16" >; let mayLoad = 1 in { defm S_GETREG_B32 : SOPK_m < sopk<0x12, 0x11>, "s_getreg_b32", (outs SReg_32:$sdst), (ins hwreg:$simm16), " $sdst, $simm16" >; } defm S_SETREG_B32 : SOPK_m < sopk<0x13, 0x12>, "s_setreg_b32", (outs), (ins SReg_32:$sdst, hwreg:$simm16), " $simm16, $sdst" >; // FIXME: Not on SI? //defm S_GETREG_REGRD_B32 : SOPK_32 , "s_getreg_regrd_b32", []>; defm S_SETREG_IMM32_B32 : SOPK_IMM32 < sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs), (ins i32imm:$imm, hwreg:$simm16), " $simm16, $imm" >; //===----------------------------------------------------------------------===// // SOPP Instructions //===----------------------------------------------------------------------===// def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; let isTerminator = 1 in { def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", [(AMDGPUendpgm)]> { let simm16 = 0; let isBarrier = 1; let hasCtrlDep = 1; let hasSideEffects = 1; } let isBranch = 1 in { def S_BRANCH : SOPP < 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", [(br bb:$simm16)]> { let isBarrier = 1; } let Uses = [SCC] in { def S_CBRANCH_SCC0 : SOPP < 0x00000004, (ins sopp_brtarget:$simm16), "s_cbranch_scc0 $simm16" >; def S_CBRANCH_SCC1 : SOPP < 0x00000005, (ins sopp_brtarget:$simm16), "s_cbranch_scc1 $simm16", [(si_uniform_br_scc SCC, bb:$simm16)] >; } // End Uses = [SCC] let Uses = [VCC] in { def S_CBRANCH_VCCZ : SOPP < 0x00000006, (ins sopp_brtarget:$simm16), "s_cbranch_vccz $simm16" >; def S_CBRANCH_VCCNZ : SOPP < 0x00000007, (ins sopp_brtarget:$simm16), "s_cbranch_vccnz $simm16" >; } // End Uses = [VCC] let Uses = [EXEC] in { def S_CBRANCH_EXECZ : SOPP < 0x00000008, (ins sopp_brtarget:$simm16), "s_cbranch_execz $simm16" >; def S_CBRANCH_EXECNZ : SOPP < 0x00000009, (ins sopp_brtarget:$simm16), "s_cbranch_execnz $simm16" >; } // End Uses = [EXEC] } // End isBranch = 1 } // End isTerminator = 1 let hasSideEffects = 1 in { def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", [(int_amdgcn_s_barrier)] > { let SchedRW = [WriteBarrier]; let simm16 = 0; let mayLoad = 1; let mayStore = 1; let isConvergent = 1; } let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; def S_SETHALT : 
SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; // On SI the documentation says sleep for approximately 64 * low 2 // bits, consistent with the reported maximum of 448. On VI the // maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the // maximum really 15 on VI? def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16), "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> { let hasSideEffects = 1; let mayLoad = 1; let mayStore = 1; } def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">; let Uses = [EXEC, M0] in { // FIXME: Should this be mayLoad+mayStore? def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", [(AMDGPUsendmsg (i32 imm:$simm16))] >; } // End Uses = [EXEC, M0] def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16">; def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { let simm16 = 0; } def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">; def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">; def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { let simm16 = 0; } } // End hasSideEffects //===----------------------------------------------------------------------===// // VOPC Instructions //===----------------------------------------------------------------------===// let isCompare = 1, isCommutable = 1 in { defm V_CMP_F_F32 : VOPC_F32 , "v_cmp_f_f32">; defm V_CMP_LT_F32 : VOPC_F32 , "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">; defm V_CMP_EQ_F32 : VOPC_F32 , "v_cmp_eq_f32", COND_OEQ>; defm V_CMP_LE_F32 : VOPC_F32 , "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">; defm V_CMP_GT_F32 : VOPC_F32 , "v_cmp_gt_f32", COND_OGT>; defm V_CMP_LG_F32 : VOPC_F32 , "v_cmp_lg_f32", COND_ONE>; defm V_CMP_GE_F32 : VOPC_F32 , "v_cmp_ge_f32", COND_OGE>; defm V_CMP_O_F32 : VOPC_F32 , "v_cmp_o_f32", COND_O>; defm V_CMP_U_F32 : VOPC_F32 , "v_cmp_u_f32", COND_UO>; defm V_CMP_NGE_F32 : VOPC_F32 , "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">; defm V_CMP_NLG_F32 : VOPC_F32 , "v_cmp_nlg_f32", COND_UEQ>; defm V_CMP_NGT_F32 : VOPC_F32 , "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">; defm V_CMP_NLE_F32 : VOPC_F32 , "v_cmp_nle_f32", COND_UGT>; defm V_CMP_NEQ_F32 : VOPC_F32 , "v_cmp_neq_f32", COND_UNE>; defm V_CMP_NLT_F32 : VOPC_F32 , "v_cmp_nlt_f32", COND_UGE>; defm V_CMP_TRU_F32 : VOPC_F32 , "v_cmp_tru_f32">; defm V_CMPX_F_F32 : VOPCX_F32 , "v_cmpx_f_f32">; defm V_CMPX_LT_F32 : VOPCX_F32 , "v_cmpx_lt_f32", "v_cmpx_gt_f32">; defm V_CMPX_EQ_F32 : VOPCX_F32 , "v_cmpx_eq_f32">; defm V_CMPX_LE_F32 : VOPCX_F32 , "v_cmpx_le_f32", "v_cmpx_ge_f32">; defm V_CMPX_GT_F32 : VOPCX_F32 , "v_cmpx_gt_f32">; defm V_CMPX_LG_F32 : VOPCX_F32 , "v_cmpx_lg_f32">; defm V_CMPX_GE_F32 : VOPCX_F32 , "v_cmpx_ge_f32">; defm V_CMPX_O_F32 : VOPCX_F32 , "v_cmpx_o_f32">; defm V_CMPX_U_F32 : VOPCX_F32 , "v_cmpx_u_f32">; defm V_CMPX_NGE_F32 : VOPCX_F32 , "v_cmpx_nge_f32">; defm V_CMPX_NLG_F32 : VOPCX_F32 , "v_cmpx_nlg_f32">; defm V_CMPX_NGT_F32 : VOPCX_F32 , "v_cmpx_ngt_f32">; defm V_CMPX_NLE_F32 : VOPCX_F32 , "v_cmpx_nle_f32">; defm V_CMPX_NEQ_F32 : VOPCX_F32 , "v_cmpx_neq_f32">; defm V_CMPX_NLT_F32 : VOPCX_F32 , "v_cmpx_nlt_f32">; defm V_CMPX_TRU_F32 : VOPCX_F32 , "v_cmpx_tru_f32">; defm V_CMP_F_F64 : VOPC_F64 , "v_cmp_f_f64">; defm V_CMP_LT_F64 : VOPC_F64 , "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">; defm V_CMP_EQ_F64 : VOPC_F64 , "v_cmp_eq_f64", COND_OEQ>; defm V_CMP_LE_F64 : VOPC_F64 , 
"v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">; defm V_CMP_GT_F64 : VOPC_F64 , "v_cmp_gt_f64", COND_OGT>; defm V_CMP_LG_F64 : VOPC_F64 , "v_cmp_lg_f64", COND_ONE>; defm V_CMP_GE_F64 : VOPC_F64 , "v_cmp_ge_f64", COND_OGE>; defm V_CMP_O_F64 : VOPC_F64 , "v_cmp_o_f64", COND_O>; defm V_CMP_U_F64 : VOPC_F64 , "v_cmp_u_f64", COND_UO>; defm V_CMP_NGE_F64 : VOPC_F64 , "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">; defm V_CMP_NLG_F64 : VOPC_F64 , "v_cmp_nlg_f64", COND_UEQ>; defm V_CMP_NGT_F64 : VOPC_F64 , "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">; defm V_CMP_NLE_F64 : VOPC_F64 , "v_cmp_nle_f64", COND_UGT>; defm V_CMP_NEQ_F64 : VOPC_F64 , "v_cmp_neq_f64", COND_UNE>; defm V_CMP_NLT_F64 : VOPC_F64 , "v_cmp_nlt_f64", COND_UGE>; defm V_CMP_TRU_F64 : VOPC_F64 , "v_cmp_tru_f64">; defm V_CMPX_F_F64 : VOPCX_F64 , "v_cmpx_f_f64">; defm V_CMPX_LT_F64 : VOPCX_F64 , "v_cmpx_lt_f64", "v_cmpx_gt_f64">; defm V_CMPX_EQ_F64 : VOPCX_F64 , "v_cmpx_eq_f64">; defm V_CMPX_LE_F64 : VOPCX_F64 , "v_cmpx_le_f64", "v_cmpx_ge_f64">; defm V_CMPX_GT_F64 : VOPCX_F64 , "v_cmpx_gt_f64">; defm V_CMPX_LG_F64 : VOPCX_F64 , "v_cmpx_lg_f64">; defm V_CMPX_GE_F64 : VOPCX_F64 , "v_cmpx_ge_f64">; defm V_CMPX_O_F64 : VOPCX_F64 , "v_cmpx_o_f64">; defm V_CMPX_U_F64 : VOPCX_F64 , "v_cmpx_u_f64">; defm V_CMPX_NGE_F64 : VOPCX_F64 , "v_cmpx_nge_f64", "v_cmpx_nle_f64">; defm V_CMPX_NLG_F64 : VOPCX_F64 , "v_cmpx_nlg_f64">; defm V_CMPX_NGT_F64 : VOPCX_F64 , "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">; defm V_CMPX_NLE_F64 : VOPCX_F64 , "v_cmpx_nle_f64">; defm V_CMPX_NEQ_F64 : VOPCX_F64 , "v_cmpx_neq_f64">; defm V_CMPX_NLT_F64 : VOPCX_F64 , "v_cmpx_nlt_f64">; defm V_CMPX_TRU_F64 : VOPCX_F64 , "v_cmpx_tru_f64">; let SubtargetPredicate = isSICI in { defm V_CMPS_F_F32 : VOPC_F32 , "v_cmps_f_f32">; defm V_CMPS_LT_F32 : VOPC_F32 , "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; defm V_CMPS_EQ_F32 : VOPC_F32 , "v_cmps_eq_f32">; defm V_CMPS_LE_F32 : VOPC_F32 , "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">; defm V_CMPS_GT_F32 : VOPC_F32 , "v_cmps_gt_f32">; defm V_CMPS_LG_F32 : VOPC_F32 , "v_cmps_lg_f32">; defm V_CMPS_GE_F32 : VOPC_F32 , "v_cmps_ge_f32">; defm V_CMPS_O_F32 : VOPC_F32 , "v_cmps_o_f32">; defm V_CMPS_U_F32 : VOPC_F32 , "v_cmps_u_f32">; defm V_CMPS_NGE_F32 : VOPC_F32 , "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">; defm V_CMPS_NLG_F32 : VOPC_F32 , "v_cmps_nlg_f32">; defm V_CMPS_NGT_F32 : VOPC_F32 , "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">; defm V_CMPS_NLE_F32 : VOPC_F32 , "v_cmps_nle_f32">; defm V_CMPS_NEQ_F32 : VOPC_F32 , "v_cmps_neq_f32">; defm V_CMPS_NLT_F32 : VOPC_F32 , "v_cmps_nlt_f32">; defm V_CMPS_TRU_F32 : VOPC_F32 , "v_cmps_tru_f32">; defm V_CMPSX_F_F32 : VOPCX_F32 , "v_cmpsx_f_f32">; defm V_CMPSX_LT_F32 : VOPCX_F32 , "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">; defm V_CMPSX_EQ_F32 : VOPCX_F32 , "v_cmpsx_eq_f32">; defm V_CMPSX_LE_F32 : VOPCX_F32 , "v_cmpsx_le_f32", "v_cmpsx_ge_f32">; defm V_CMPSX_GT_F32 : VOPCX_F32 , "v_cmpsx_gt_f32">; defm V_CMPSX_LG_F32 : VOPCX_F32 , "v_cmpsx_lg_f32">; defm V_CMPSX_GE_F32 : VOPCX_F32 , "v_cmpsx_ge_f32">; defm V_CMPSX_O_F32 : VOPCX_F32 , "v_cmpsx_o_f32">; defm V_CMPSX_U_F32 : VOPCX_F32 , "v_cmpsx_u_f32">; defm V_CMPSX_NGE_F32 : VOPCX_F32 , "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">; defm V_CMPSX_NLG_F32 : VOPCX_F32 , "v_cmpsx_nlg_f32">; defm V_CMPSX_NGT_F32 : VOPCX_F32 , "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">; defm V_CMPSX_NLE_F32 : VOPCX_F32 , "v_cmpsx_nle_f32">; defm V_CMPSX_NEQ_F32 : VOPCX_F32 , "v_cmpsx_neq_f32">; defm V_CMPSX_NLT_F32 : VOPCX_F32 , "v_cmpsx_nlt_f32">; defm V_CMPSX_TRU_F32 : VOPCX_F32 , "v_cmpsx_tru_f32">; 
defm V_CMPS_F_F64 : VOPC_F64 , "v_cmps_f_f64">; defm V_CMPS_LT_F64 : VOPC_F64 , "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; defm V_CMPS_EQ_F64 : VOPC_F64 , "v_cmps_eq_f64">; defm V_CMPS_LE_F64 : VOPC_F64 , "v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">; defm V_CMPS_GT_F64 : VOPC_F64 , "v_cmps_gt_f64">; defm V_CMPS_LG_F64 : VOPC_F64 , "v_cmps_lg_f64">; defm V_CMPS_GE_F64 : VOPC_F64 , "v_cmps_ge_f64">; defm V_CMPS_O_F64 : VOPC_F64 , "v_cmps_o_f64">; defm V_CMPS_U_F64 : VOPC_F64 , "v_cmps_u_f64">; defm V_CMPS_NGE_F64 : VOPC_F64 , "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; defm V_CMPS_NLG_F64 : VOPC_F64 , "v_cmps_nlg_f64">; defm V_CMPS_NGT_F64 : VOPC_F64 , "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; defm V_CMPS_NLE_F64 : VOPC_F64 , "v_cmps_nle_f64">; defm V_CMPS_NEQ_F64 : VOPC_F64 , "v_cmps_neq_f64">; defm V_CMPS_NLT_F64 : VOPC_F64 , "v_cmps_nlt_f64">; defm V_CMPS_TRU_F64 : VOPC_F64 , "v_cmps_tru_f64">; defm V_CMPSX_F_F64 : VOPCX_F64 , "v_cmpsx_f_f64">; defm V_CMPSX_LT_F64 : VOPCX_F64 , "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; defm V_CMPSX_EQ_F64 : VOPCX_F64 , "v_cmpsx_eq_f64">; defm V_CMPSX_LE_F64 : VOPCX_F64 , "v_cmpsx_le_f64", "v_cmpsx_ge_f64">; defm V_CMPSX_GT_F64 : VOPCX_F64 , "v_cmpsx_gt_f64">; defm V_CMPSX_LG_F64 : VOPCX_F64 , "v_cmpsx_lg_f64">; defm V_CMPSX_GE_F64 : VOPCX_F64 , "v_cmpsx_ge_f64">; defm V_CMPSX_O_F64 : VOPCX_F64 , "v_cmpsx_o_f64">; defm V_CMPSX_U_F64 : VOPCX_F64 , "v_cmpsx_u_f64">; defm V_CMPSX_NGE_F64 : VOPCX_F64 , "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; defm V_CMPSX_NLG_F64 : VOPCX_F64 , "v_cmpsx_nlg_f64">; defm V_CMPSX_NGT_F64 : VOPCX_F64 , "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; defm V_CMPSX_NLE_F64 : VOPCX_F64 , "v_cmpsx_nle_f64">; defm V_CMPSX_NEQ_F64 : VOPCX_F64 , "v_cmpsx_neq_f64">; defm V_CMPSX_NLT_F64 : VOPCX_F64 , "v_cmpsx_nlt_f64">; defm V_CMPSX_TRU_F64 : VOPCX_F64 , "v_cmpsx_tru_f64">; } // End SubtargetPredicate = isSICI defm V_CMP_F_I32 : VOPC_I32 , "v_cmp_f_i32">; defm V_CMP_LT_I32 : VOPC_I32 , "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; defm V_CMP_EQ_I32 : VOPC_I32 , "v_cmp_eq_i32", COND_EQ>; defm V_CMP_LE_I32 : VOPC_I32 , "v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">; defm V_CMP_GT_I32 : VOPC_I32 , "v_cmp_gt_i32", COND_SGT>; defm V_CMP_NE_I32 : VOPC_I32 , "v_cmp_ne_i32", COND_NE>; defm V_CMP_GE_I32 : VOPC_I32 , "v_cmp_ge_i32", COND_SGE>; defm V_CMP_T_I32 : VOPC_I32 , "v_cmp_t_i32">; defm V_CMPX_F_I32 : VOPCX_I32 , "v_cmpx_f_i32">; defm V_CMPX_LT_I32 : VOPCX_I32 , "v_cmpx_lt_i32", "v_cmpx_gt_i32">; defm V_CMPX_EQ_I32 : VOPCX_I32 , "v_cmpx_eq_i32">; defm V_CMPX_LE_I32 : VOPCX_I32 , "v_cmpx_le_i32", "v_cmpx_ge_i32">; defm V_CMPX_GT_I32 : VOPCX_I32 , "v_cmpx_gt_i32">; defm V_CMPX_NE_I32 : VOPCX_I32 , "v_cmpx_ne_i32">; defm V_CMPX_GE_I32 : VOPCX_I32 , "v_cmpx_ge_i32">; defm V_CMPX_T_I32 : VOPCX_I32 , "v_cmpx_t_i32">; defm V_CMP_F_I64 : VOPC_I64 , "v_cmp_f_i64">; defm V_CMP_LT_I64 : VOPC_I64 , "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">; defm V_CMP_EQ_I64 : VOPC_I64 , "v_cmp_eq_i64", COND_EQ>; defm V_CMP_LE_I64 : VOPC_I64 , "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">; defm V_CMP_GT_I64 : VOPC_I64 , "v_cmp_gt_i64", COND_SGT>; defm V_CMP_NE_I64 : VOPC_I64 , "v_cmp_ne_i64", COND_NE>; defm V_CMP_GE_I64 : VOPC_I64 , "v_cmp_ge_i64", COND_SGE>; defm V_CMP_T_I64 : VOPC_I64 , "v_cmp_t_i64">; defm V_CMPX_F_I64 : VOPCX_I64 , "v_cmpx_f_i64">; defm V_CMPX_LT_I64 : VOPCX_I64 , "v_cmpx_lt_i64", "v_cmpx_gt_i64">; defm V_CMPX_EQ_I64 : VOPCX_I64 , "v_cmpx_eq_i64">; defm V_CMPX_LE_I64 : VOPCX_I64 , "v_cmpx_le_i64", "v_cmpx_ge_i64">; defm V_CMPX_GT_I64 : VOPCX_I64 , "v_cmpx_gt_i64">; 
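// Editorial note: the VOPCX_* ("v_cmpx_*") variants in this block write the
// comparison result not only to the SGPR-pair destination but also to EXEC,
// so lanes that fail the test become inactive.  A rough per-wave model in C++
// (illustrative sketch only; a 64-lane wavefront is assumed):
//
//   #include <cstdint>
//   uint64_t v_cmpx(uint64_t exec, const int32_t *a, const int32_t *b,
//                   bool (*pred)(int32_t, int32_t)) {
//     uint64_t mask = 0;
//     for (unsigned lane = 0; lane < 64; ++lane)
//       if (((exec >> lane) & 1) && pred(a[lane], b[lane]))
//         mask |= uint64_t(1) << lane;
//     return mask;  // result goes to both the destination and EXEC
//   }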
defm V_CMPX_NE_I64 : VOPCX_I64 , "v_cmpx_ne_i64">; defm V_CMPX_GE_I64 : VOPCX_I64 , "v_cmpx_ge_i64">; defm V_CMPX_T_I64 : VOPCX_I64 , "v_cmpx_t_i64">; defm V_CMP_F_U32 : VOPC_I32 , "v_cmp_f_u32">; defm V_CMP_LT_U32 : VOPC_I32 , "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">; defm V_CMP_EQ_U32 : VOPC_I32 , "v_cmp_eq_u32", COND_EQ>; defm V_CMP_LE_U32 : VOPC_I32 , "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">; defm V_CMP_GT_U32 : VOPC_I32 , "v_cmp_gt_u32", COND_UGT>; defm V_CMP_NE_U32 : VOPC_I32 , "v_cmp_ne_u32", COND_NE>; defm V_CMP_GE_U32 : VOPC_I32 , "v_cmp_ge_u32", COND_UGE>; defm V_CMP_T_U32 : VOPC_I32 , "v_cmp_t_u32">; defm V_CMPX_F_U32 : VOPCX_I32 , "v_cmpx_f_u32">; defm V_CMPX_LT_U32 : VOPCX_I32 , "v_cmpx_lt_u32", "v_cmpx_gt_u32">; defm V_CMPX_EQ_U32 : VOPCX_I32 , "v_cmpx_eq_u32">; defm V_CMPX_LE_U32 : VOPCX_I32 , "v_cmpx_le_u32", "v_cmpx_le_u32">; defm V_CMPX_GT_U32 : VOPCX_I32 , "v_cmpx_gt_u32">; defm V_CMPX_NE_U32 : VOPCX_I32 , "v_cmpx_ne_u32">; defm V_CMPX_GE_U32 : VOPCX_I32 , "v_cmpx_ge_u32">; defm V_CMPX_T_U32 : VOPCX_I32 , "v_cmpx_t_u32">; defm V_CMP_F_U64 : VOPC_I64 , "v_cmp_f_u64">; defm V_CMP_LT_U64 : VOPC_I64 , "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">; defm V_CMP_EQ_U64 : VOPC_I64 , "v_cmp_eq_u64", COND_EQ>; defm V_CMP_LE_U64 : VOPC_I64 , "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">; defm V_CMP_GT_U64 : VOPC_I64 , "v_cmp_gt_u64", COND_UGT>; defm V_CMP_NE_U64 : VOPC_I64 , "v_cmp_ne_u64", COND_NE>; defm V_CMP_GE_U64 : VOPC_I64 , "v_cmp_ge_u64", COND_UGE>; defm V_CMP_T_U64 : VOPC_I64 , "v_cmp_t_u64">; defm V_CMPX_F_U64 : VOPCX_I64 , "v_cmpx_f_u64">; defm V_CMPX_LT_U64 : VOPCX_I64 , "v_cmpx_lt_u64", "v_cmpx_gt_u64">; defm V_CMPX_EQ_U64 : VOPCX_I64 , "v_cmpx_eq_u64">; defm V_CMPX_LE_U64 : VOPCX_I64 , "v_cmpx_le_u64", "v_cmpx_ge_u64">; defm V_CMPX_GT_U64 : VOPCX_I64 , "v_cmpx_gt_u64">; defm V_CMPX_NE_U64 : VOPCX_I64 , "v_cmpx_ne_u64">; defm V_CMPX_GE_U64 : VOPCX_I64 , "v_cmpx_ge_u64">; defm V_CMPX_T_U64 : VOPCX_I64 , "v_cmpx_t_u64">; } // End isCompare = 1, isCommutable = 1 defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 , "v_cmp_class_f32">; defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 , "v_cmpx_class_f32">; defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 , "v_cmp_class_f64">; defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 , "v_cmpx_class_f64">; //===----------------------------------------------------------------------===// // DS Instructions //===----------------------------------------------------------------------===// defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; let mayLoad = 0 in { defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; defm DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; } defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; 
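// Editorial note: the DS_*_NORET definitions in this section are LDS
// read-modify-write operations whose previous value is discarded; the matching
// DS_*_RTN_* forms further below return it.  Roughly analogous host-side C++
// (illustrative sketch only, not the hardware model):
//
//   #include <atomic>
//   #include <cstdint>
//   void ds_add_u32(std::atomic<uint32_t> &lds, uint32_t v) {
//     lds.fetch_add(v);              // old value ignored -> no-return form
//   }
//   uint32_t ds_add_rtn_u32(std::atomic<uint32_t> &lds, uint32_t v) {
//     return lds.fetch_add(v);       // old value returned -> "_rtn_" form
//   }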
defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>; defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>; defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">; defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">; defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">; defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">; defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">; let mayLoad = 0 in { defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>; defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>; } defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET < 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32 >; defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET < 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32 >; defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; let Uses = [EXEC], mayLoad =0, mayStore = 0, isConvergent = 1 in { defm DS_SWIZZLE_B32 : DS_1A_RET_ , "ds_swizzle_b32", VGPR_32>; } let mayStore = 0 in { defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, "ds_read2st64_b32", VReg_64>; defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>; defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>; defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>; defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>; } defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">; defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">; defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">; defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; defm DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; defm DS_MIN_U64 : 
DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; let mayLoad = 0 in { defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; defm DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; } defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>; defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>; defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">; defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; let mayStore = 0 in { defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>; defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>; defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>; } defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">; defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">; defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">; defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">; defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">; defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">; defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">; defm DS_MIN_SRC2_U32 : DS_1A <0x87, "ds_min_src2_u32">; defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">; defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; defm DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET 
<0x8d, "ds_write_src2_b32">; defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">; defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">; defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">; defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">; defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">; defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">; defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">; defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">; defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">; defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; defm DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET <0xcd, "ds_write_src2_b64">; defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; //===----------------------------------------------------------------------===// // MUBUF Instructions //===----------------------------------------------------------------------===// defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper < mubuf<0x00>, "buffer_load_format_x", VGPR_32 >; defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper < mubuf<0x01>, "buffer_load_format_xy", VReg_64 >; defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < mubuf<0x02>, "buffer_load_format_xyz", VReg_96 >; defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 >; defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < mubuf<0x04>, "buffer_store_format_x", VGPR_32 >; defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper < mubuf<0x05>, "buffer_store_format_xy", VReg_64 >; defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Store_Helper < mubuf<0x06>, "buffer_store_format_xyz", VReg_96 >; defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 >; defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8 >; defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8 >; defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16 >; defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16 >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load >; defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global >; defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global >; defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store >; defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store >; defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store >; defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global >; defm 
BUFFER_ATOMIC_CMPSWAP : MUBUF_Atomic < mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag >; defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global >; defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global >; //def BUFFER_ATOMIC_RSUB : MUBUF_ , "buffer_atomic_rsub", []>; // isn't on CI & VI defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global >; defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global >; defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global >; defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global >; defm BUFFER_ATOMIC_AND : MUBUF_Atomic < mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global >; defm BUFFER_ATOMIC_OR : MUBUF_Atomic < mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global >; defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global >; defm BUFFER_ATOMIC_INC : MUBUF_Atomic < mubuf<0x3c, 0x4b>, "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global >; defm BUFFER_ATOMIC_DEC : MUBUF_Atomic < mubuf<0x3d, 0x4c>, "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global >; //def BUFFER_ATOMIC_FCMPSWAP : MUBUF_Atomic , "buffer_atomic_fcmpswap", []>; // isn't on VI //def BUFFER_ATOMIC_FMIN : MUBUF_Atomic , "buffer_atomic_fmin", []>; // isn't on VI //def BUFFER_ATOMIC_FMAX : MUBUF_Atomic , "buffer_atomic_fmax", []>; // isn't on VI defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Atomic < mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global >; defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic < mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag >; defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Atomic < mubuf<0x52, 0x62>, "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global >; defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Atomic < mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global >; //defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Atomic , "buffer_atomic_rsub_x2", []>; // isn't on CI & VI defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Atomic < mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global >; defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Atomic < mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global >; defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Atomic < mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global >; defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Atomic < mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global >; defm BUFFER_ATOMIC_AND_X2 : MUBUF_Atomic < mubuf<0x59, 0x68>, "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global >; defm BUFFER_ATOMIC_OR_X2 : MUBUF_Atomic < mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global >; defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Atomic < mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global >; defm BUFFER_ATOMIC_INC_X2 : MUBUF_Atomic < mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global >; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Atomic < mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global >; //def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_fcmpswap_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 , 
"buffer_atomic_fmin_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 , "buffer_atomic_fmax_x2", []>; // isn't on VI let SubtargetPredicate = isSI, DisableVIDecoder = 1 in { defm BUFFER_WBINVL1_SC : MUBUF_Invalidate , "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI } defm BUFFER_WBINVL1 : MUBUF_Invalidate , "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; //===----------------------------------------------------------------------===// // MTBUF Instructions //===----------------------------------------------------------------------===// //def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>; //def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>; //def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>; defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>; defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>; defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>; defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>; defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>; //===----------------------------------------------------------------------===// // MIMG Instructions //===----------------------------------------------------------------------===// defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; //def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; //def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; //def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; //def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">; defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">; //def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; //def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; defm IMAGE_ATOMIC_SWAP : MIMG_Atomic , "image_atomic_swap">; defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic , "image_atomic_cmpswap", VReg_64>; defm IMAGE_ATOMIC_ADD : MIMG_Atomic , "image_atomic_add">; defm IMAGE_ATOMIC_SUB : MIMG_Atomic , "image_atomic_sub">; //def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI defm IMAGE_ATOMIC_SMIN : MIMG_Atomic , "image_atomic_smin">; defm IMAGE_ATOMIC_UMIN : MIMG_Atomic , "image_atomic_umin">; defm IMAGE_ATOMIC_SMAX : MIMG_Atomic , "image_atomic_smax">; defm IMAGE_ATOMIC_UMAX : MIMG_Atomic , "image_atomic_umax">; defm IMAGE_ATOMIC_AND : MIMG_Atomic , "image_atomic_and">; defm IMAGE_ATOMIC_OR : MIMG_Atomic , "image_atomic_or">; defm IMAGE_ATOMIC_XOR : MIMG_Atomic , "image_atomic_xor">; defm IMAGE_ATOMIC_INC : MIMG_Atomic , "image_atomic_inc">; defm IMAGE_ATOMIC_DEC : MIMG_Atomic , "image_atomic_dec">; //def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; defm 
IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">; defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, 
"image_gather4_b_cl_o">; defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; //===----------------------------------------------------------------------===// // VOP1 Instructions //===----------------------------------------------------------------------===// let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { defm V_NOP : VOP1Inst , "v_nop", VOP_NONE>; } let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm V_MOV_B32 : VOP1Inst , "v_mov_b32", VOP_I32_I32>; } // End isMoveImm = 1 let Uses = [EXEC] in { // FIXME: Specify SchedRW for READFIRSTLANE_B32 def V_READFIRSTLANE_B32 : VOP1 < 0x00000002, (outs SReg_32:$vdst), (ins VS_32:$src0), "v_readfirstlane_b32 $vdst, $src0", [] > { let isConvergent = 1; } } let SchedRW = [WriteQuarterRate32] in { defm V_CVT_I32_F64 : VOP1Inst , "v_cvt_i32_f64", VOP_I32_F64, fp_to_sint >; defm V_CVT_F64_I32 : VOP1Inst , "v_cvt_f64_i32", VOP_F64_I32, sint_to_fp >; defm V_CVT_F32_I32 : VOP1Inst , "v_cvt_f32_i32", VOP_F32_I32, sint_to_fp >; defm V_CVT_F32_U32 : VOP1Inst , "v_cvt_f32_u32", VOP_F32_I32, uint_to_fp >; defm V_CVT_U32_F32 : VOP1Inst , "v_cvt_u32_f32", VOP_I32_F32, fp_to_uint >; defm V_CVT_I32_F32 : VOP1Inst , "v_cvt_i32_f32", VOP_I32_F32, fp_to_sint >; defm V_CVT_F16_F32 : VOP1Inst , "v_cvt_f16_f32", VOP_I32_F32, fp_to_f16 >; defm V_CVT_F32_F16 : VOP1Inst , "v_cvt_f32_f16", VOP_F32_I32, f16_to_fp >; defm V_CVT_RPI_I32_F32 : VOP1Inst , "v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst , "v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; defm V_CVT_OFF_F32_I4 : VOP1Inst , "v_cvt_off_f32_i4", VOP_F32_I32>; defm V_CVT_F32_F64 : VOP1Inst , "v_cvt_f32_f64", VOP_F32_F64, fround >; defm V_CVT_F64_F32 : VOP1Inst , "v_cvt_f64_f32", VOP_F64_F32, fextend >; defm V_CVT_F32_UBYTE0 : VOP1Inst , "v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0 >; defm V_CVT_F32_UBYTE1 : VOP1Inst , "v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1 >; defm V_CVT_F32_UBYTE2 : VOP1Inst , "v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2 >; defm V_CVT_F32_UBYTE3 : VOP1Inst , "v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3 >; defm V_CVT_U32_F64 : VOP1Inst , "v_cvt_u32_f64", VOP_I32_F64, fp_to_uint >; defm V_CVT_F64_U32 : 
VOP1Inst , "v_cvt_f64_u32", VOP_F64_I32, uint_to_fp >; } // End SchedRW = [WriteQuarterRate32] defm V_FRACT_F32 : VOP1Inst , "v_fract_f32", VOP_F32_F32, AMDGPUfract >; defm V_TRUNC_F32 : VOP1Inst , "v_trunc_f32", VOP_F32_F32, ftrunc >; defm V_CEIL_F32 : VOP1Inst , "v_ceil_f32", VOP_F32_F32, fceil >; defm V_RNDNE_F32 : VOP1Inst , "v_rndne_f32", VOP_F32_F32, frint >; defm V_FLOOR_F32 : VOP1Inst , "v_floor_f32", VOP_F32_F32, ffloor >; defm V_EXP_F32 : VOP1Inst , "v_exp_f32", VOP_F32_F32, fexp2 >; let SchedRW = [WriteQuarterRate32] in { defm V_LOG_F32 : VOP1Inst , "v_log_f32", VOP_F32_F32, flog2 >; defm V_RCP_F32 : VOP1Inst , "v_rcp_f32", VOP_F32_F32, AMDGPUrcp >; defm V_RCP_IFLAG_F32 : VOP1Inst , "v_rcp_iflag_f32", VOP_F32_F32 >; defm V_RSQ_F32 : VOP1Inst , "v_rsq_f32", VOP_F32_F32, AMDGPUrsq >; } // End SchedRW = [WriteQuarterRate32] let SchedRW = [WriteDouble] in { defm V_RCP_F64 : VOP1Inst , "v_rcp_f64", VOP_F64_F64, AMDGPUrcp >; defm V_RSQ_F64 : VOP1Inst , "v_rsq_f64", VOP_F64_F64, AMDGPUrsq >; } // End SchedRW = [WriteDouble]; defm V_SQRT_F32 : VOP1Inst , "v_sqrt_f32", VOP_F32_F32, fsqrt >; let SchedRW = [WriteDouble] in { defm V_SQRT_F64 : VOP1Inst , "v_sqrt_f64", VOP_F64_F64, fsqrt >; } // End SchedRW = [WriteDouble] let SchedRW = [WriteQuarterRate32] in { defm V_SIN_F32 : VOP1Inst , "v_sin_f32", VOP_F32_F32, AMDGPUsin >; defm V_COS_F32 : VOP1Inst , "v_cos_f32", VOP_F32_F32, AMDGPUcos >; } // End SchedRW = [WriteQuarterRate32] defm V_NOT_B32 : VOP1Inst , "v_not_b32", VOP_I32_I32>; defm V_BFREV_B32 : VOP1Inst , "v_bfrev_b32", VOP_I32_I32>; defm V_FFBH_U32 : VOP1Inst , "v_ffbh_u32", VOP_I32_I32>; defm V_FFBL_B32 : VOP1Inst , "v_ffbl_b32", VOP_I32_I32>; defm V_FFBH_I32 : VOP1Inst , "v_ffbh_i32", VOP_I32_I32>; defm V_FREXP_EXP_I32_F64 : VOP1Inst , "v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp >; let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_MANT_F64 : VOP1Inst , "v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant >; defm V_FRACT_F64 : VOP1Inst , "v_fract_f64", VOP_F64_F64, AMDGPUfract >; } // End SchedRW = [WriteDoubleAdd] defm V_FREXP_EXP_I32_F32 : VOP1Inst , "v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp >; defm V_FREXP_MANT_F32 : VOP1Inst , "v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant >; let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { defm V_CLREXCP : VOP1Inst , "v_clrexcp", VOP_NO_EXT>; } let Uses = [M0, EXEC] in { defm V_MOVRELD_B32 : VOP1Inst , "v_movreld_b32", VOP_NO_EXT>; defm V_MOVRELS_B32 : VOP1Inst , "v_movrels_b32", VOP_NO_EXT>; defm V_MOVRELSD_B32 : VOP1Inst , "v_movrelsd_b32", VOP_NO_EXT>; } // End Uses = [M0, EXEC] // These instruction only exist on SI and CI let SubtargetPredicate = isSICI in { let SchedRW = [WriteQuarterRate32] in { defm V_MOV_FED_B32 : VOP1InstSI , "v_mov_fed_b32", VOP_I32_I32>; defm V_LOG_CLAMP_F32 : VOP1InstSI , "v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>; defm V_RCP_CLAMP_F32 : VOP1InstSI , "v_rcp_clamp_f32", VOP_F32_F32>; defm V_RCP_LEGACY_F32 : VOP1InstSI , "v_rcp_legacy_f32", VOP_F32_F32>; defm V_RSQ_CLAMP_F32 : VOP1InstSI , "v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp >; defm V_RSQ_LEGACY_F32 : VOP1InstSI , "v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy >; } // End SchedRW = [WriteQuarterRate32] let SchedRW = [WriteDouble] in { defm V_RCP_CLAMP_F64 : VOP1InstSI , "v_rcp_clamp_f64", VOP_F64_F64>; defm V_RSQ_CLAMP_F64 : VOP1InstSI , "v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp >; } // End SchedRW = [WriteDouble] } // End SubtargetPredicate = isSICI 
//===----------------------------------------------------------------------===// // VINTRP Instructions //===----------------------------------------------------------------------===// let Uses = [M0, EXEC] in { // FIXME: Specify SchedRW for VINTRP insturctions. multiclass V_INTERP_P1_F32_m : VINTRP_m < 0x00000000, (outs VGPR_32:$dst), (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr), "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]", [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan), (i32 imm:$attr)))] >; let OtherPredicates = [has32BankLDS] in { defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; } // End OtherPredicates = [has32BankLDS] let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1 in { defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; } // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1 let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in { defm V_INTERP_P2_F32 : VINTRP_m < 0x00000001, (outs VGPR_32:$dst), (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr), "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]", [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, i32:$j, (i32 imm:$attr_chan), (i32 imm:$attr)))]>; } // End DisableEncoding = "$src0", Constraints = "$src0 = $dst" defm V_INTERP_MOV_F32 : VINTRP_m < 0x00000002, (outs VGPR_32:$dst), (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr), "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]", [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), (i32 imm:$attr)))]>; } // End Uses = [M0, EXEC] //===----------------------------------------------------------------------===// // VOP2 Instructions //===----------------------------------------------------------------------===// defm V_CNDMASK_B32 : VOP2eInst , "v_cndmask_b32", VOP2e_I32_I32_I32_I1 >; let isCommutable = 1 in { defm V_ADD_F32 : VOP2Inst , "v_add_f32", VOP_F32_F32_F32, fadd >; defm V_SUB_F32 : VOP2Inst , "v_sub_f32", VOP_F32_F32_F32, fsub>; defm V_SUBREV_F32 : VOP2Inst , "v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32" >; } // End isCommutable = 1 let isCommutable = 1 in { defm V_MUL_LEGACY_F32 : VOP2Inst , "v_mul_legacy_f32", VOP_F32_F32_F32 >; defm V_MUL_F32 : VOP2Inst , "v_mul_f32", VOP_F32_F32_F32, fmul >; defm V_MUL_I32_I24 : VOP2Inst , "v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24 >; defm V_MUL_HI_I32_I24 : VOP2Inst , "v_mul_hi_i32_i24", VOP_I32_I32_I32 >; defm V_MUL_U32_U24 : VOP2Inst , "v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24 >; defm V_MUL_HI_U32_U24 : VOP2Inst , "v_mul_hi_u32_u24", VOP_I32_I32_I32 >; defm V_MIN_F32 : VOP2Inst , "v_min_f32", VOP_F32_F32_F32, fminnum>; defm V_MAX_F32 : VOP2Inst , "v_max_f32", VOP_F32_F32_F32, fmaxnum>; defm V_MIN_I32 : VOP2Inst , "v_min_i32", VOP_I32_I32_I32>; defm V_MAX_I32 : VOP2Inst , "v_max_i32", VOP_I32_I32_I32>; defm V_MIN_U32 : VOP2Inst , "v_min_u32", VOP_I32_I32_I32>; defm V_MAX_U32 : VOP2Inst , "v_max_u32", VOP_I32_I32_I32>; defm V_LSHRREV_B32 : VOP2Inst < vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32" >; defm V_ASHRREV_I32 : VOP2Inst < vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32" >; defm V_LSHLREV_B32 : VOP2Inst < vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32" >; defm V_AND_B32 : VOP2Inst , "v_and_b32", VOP_I32_I32_I32>; defm V_OR_B32 : VOP2Inst , "v_or_b32", VOP_I32_I32_I32>; defm V_XOR_B32 : VOP2Inst , "v_xor_b32", VOP_I32_I32_I32>; let Constraints = "$vdst = $src2", 
DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { defm V_MAC_F32 : VOP2Inst , "v_mac_f32", VOP_MAC>; } } // End isCommutable = 1 defm V_MADMK_F32 : VOP2MADK , "v_madmk_f32", VOP_MADMK>; let isCommutable = 1 in { defm V_MADAK_F32 : VOP2MADK , "v_madak_f32", VOP_MADAK>; } // End isCommutable = 1 let isCommutable = 1 in { // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. // V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, // but the VI instructions behave the same as the SI versions. defm V_ADD_I32 : VOP2bInst , "v_add_i32", VOP2b_I32_I1_I32_I32 >; defm V_SUB_I32 : VOP2bInst , "v_sub_i32", VOP2b_I32_I1_I32_I32>; defm V_SUBREV_I32 : VOP2bInst , "v_subrev_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32" >; defm V_ADDC_U32 : VOP2bInst , "v_addc_u32", VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBB_U32 : VOP2bInst , "v_subb_u32", VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBBREV_U32 : VOP2bInst , "v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32" >; } // End isCommutable = 1 // These are special and do not read the exec mask. let isConvergent = 1, Uses = [] in { defm V_READLANE_B32 : VOP2SI_3VI_m < vop3 <0x001, 0x289>, "v_readlane_b32", (outs SReg_32:$vdst), (ins VS_32:$src0, SCSrc_32:$src1), "v_readlane_b32 $vdst, $src0, $src1" >; defm V_WRITELANE_B32 : VOP2SI_3VI_m < vop3 <0x002, 0x28a>, "v_writelane_b32", (outs VGPR_32:$vdst), (ins SReg_32:$src0, SCSrc_32:$src1), "v_writelane_b32 $vdst, $src0, $src1" >; } // End isConvergent = 1 // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { let isCommutable = 1 in { defm V_MAC_LEGACY_F32 : VOP2InstSI , "v_mac_legacy_f32", VOP_F32_F32_F32 >; } // End isCommutable = 1 defm V_MIN_LEGACY_F32 : VOP2InstSI , "v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy >; defm V_MAX_LEGACY_F32 : VOP2InstSI , "v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy >; let isCommutable = 1 in { defm V_LSHR_B32 : VOP2InstSI , "v_lshr_b32", VOP_I32_I32_I32>; defm V_ASHR_I32 : VOP2InstSI , "v_ashr_i32", VOP_I32_I32_I32>; defm V_LSHL_B32 : VOP2InstSI , "v_lshl_b32", VOP_I32_I32_I32>; } // End isCommutable = 1 } // End let SubtargetPredicate = SICI defm V_BFM_B32 : VOP2_VI3_Inst , "v_bfm_b32", VOP_I32_I32_I32 >; defm V_BCNT_U32_B32 : VOP2_VI3_Inst , "v_bcnt_u32_b32", VOP_I32_I32_I32 >; defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo >; defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi >; defm V_LDEXP_F32 : VOP2_VI3_Inst , "v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp >; defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst , "v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst" defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst , "v_cvt_pknorm_i16_f32", VOP_I32_F32_F32 >; defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst , "v_cvt_pknorm_u16_f32", VOP_I32_F32_F32 >; defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst , "v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16 >; defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst , "v_cvt_pk_u16_u32", VOP_I32_I32_I32 >; defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst , "v_cvt_pk_i16_i32", VOP_I32_I32_I32 >; //===----------------------------------------------------------------------===// // VOP3 Instructions //===----------------------------------------------------------------------===// let isCommutable = 1 in { defm V_MAD_LEGACY_F32 : VOP3Inst , "v_mad_legacy_f32", VOP_F32_F32_F32_F32 >; defm V_MAD_F32 : VOP3Inst , "v_mad_f32", 
VOP_F32_F32_F32_F32, fmad >; defm V_MAD_I32_I24 : VOP3Inst , "v_mad_i32_i24", VOP_I32_I32_I32_I32, AMDGPUmad_i24 >; defm V_MAD_U32_U24 : VOP3Inst , "v_mad_u32_u24", VOP_I32_I32_I32_I32, AMDGPUmad_u24 >; } // End isCommutable = 1 defm V_CUBEID_F32 : VOP3Inst , "v_cubeid_f32", VOP_F32_F32_F32_F32, int_amdgcn_cubeid >; defm V_CUBESC_F32 : VOP3Inst , "v_cubesc_f32", VOP_F32_F32_F32_F32, int_amdgcn_cubesc >; defm V_CUBETC_F32 : VOP3Inst , "v_cubetc_f32", VOP_F32_F32_F32_F32, int_amdgcn_cubetc >; defm V_CUBEMA_F32 : VOP3Inst , "v_cubema_f32", VOP_F32_F32_F32_F32, int_amdgcn_cubema >; defm V_BFE_U32 : VOP3Inst , "v_bfe_u32", VOP_I32_I32_I32_I32, AMDGPUbfe_u32 >; defm V_BFE_I32 : VOP3Inst , "v_bfe_i32", VOP_I32_I32_I32_I32, AMDGPUbfe_i32 >; defm V_BFI_B32 : VOP3Inst , "v_bfi_b32", VOP_I32_I32_I32_I32, AMDGPUbfi >; let isCommutable = 1 in { defm V_FMA_F32 : VOP3Inst , "v_fma_f32", VOP_F32_F32_F32_F32, fma >; defm V_FMA_F64 : VOP3Inst , "v_fma_f64", VOP_F64_F64_F64_F64, fma >; defm V_LERP_U8 : VOP3Inst , "v_lerp_u8", VOP_I32_I32_I32_I32, int_amdgcn_lerp >; } // End isCommutable = 1 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; defm V_ALIGNBIT_B32 : VOP3Inst , "v_alignbit_b32", VOP_I32_I32_I32_I32 >; defm V_ALIGNBYTE_B32 : VOP3Inst , "v_alignbyte_b32", VOP_I32_I32_I32_I32 >; defm V_MIN3_F32 : VOP3Inst , "v_min3_f32", VOP_F32_F32_F32_F32, AMDGPUfmin3>; defm V_MIN3_I32 : VOP3Inst , "v_min3_i32", VOP_I32_I32_I32_I32, AMDGPUsmin3 >; defm V_MIN3_U32 : VOP3Inst , "v_min3_u32", VOP_I32_I32_I32_I32, AMDGPUumin3 >; defm V_MAX3_F32 : VOP3Inst , "v_max3_f32", VOP_F32_F32_F32_F32, AMDGPUfmax3 >; defm V_MAX3_I32 : VOP3Inst , "v_max3_i32", VOP_I32_I32_I32_I32, AMDGPUsmax3 >; defm V_MAX3_U32 : VOP3Inst , "v_max3_u32", VOP_I32_I32_I32_I32, AMDGPUumax3 >; defm V_MED3_F32 : VOP3Inst , "v_med3_f32", VOP_F32_F32_F32_F32, AMDGPUfmed3 >; defm V_MED3_I32 : VOP3Inst , "v_med3_i32", VOP_I32_I32_I32_I32, AMDGPUsmed3 >; defm V_MED3_U32 : VOP3Inst , "v_med3_u32", VOP_I32_I32_I32_I32, AMDGPUumed3 >; //def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; //def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; //def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; defm V_SAD_U32 : VOP3Inst , "v_sad_u32", VOP_I32_I32_I32_I32 >; //def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; defm V_DIV_FIXUP_F32 : VOP3Inst < vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; let SchedRW = [WriteDoubleAdd] in { defm V_DIV_FIXUP_F64 : VOP3Inst < vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup >; } // End SchedRW = [WriteDouble] let SchedRW = [WriteDoubleAdd] in { let isCommutable = 1 in { defm V_ADD_F64 : VOP3Inst , "v_add_f64", VOP_F64_F64_F64, fadd, 1 >; defm V_MUL_F64 : VOP3Inst , "v_mul_f64", VOP_F64_F64_F64, fmul, 1 >; defm V_MIN_F64 : VOP3Inst , "v_min_f64", VOP_F64_F64_F64, fminnum, 1 >; defm V_MAX_F64 : VOP3Inst , "v_max_f64", VOP_F64_F64_F64, fmaxnum, 1 >; } // End isCommutable = 1 defm V_LDEXP_F64 : VOP3Inst , "v_ldexp_f64", VOP_F64_F64_I32, AMDGPUldexp, 1 >; } // End let SchedRW = [WriteDoubleAdd] let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { defm V_MUL_LO_U32 : VOP3Inst , "v_mul_lo_u32", VOP_I32_I32_I32 >; defm V_MUL_HI_U32 : VOP3Inst , "v_mul_hi_u32", VOP_I32_I32_I32, mulhu >; let DisableVIDecoder=1 in { // removed from VI as identical to V_MUL_LO_U32 defm V_MUL_LO_I32 : VOP3Inst , "v_mul_lo_i32", VOP_I32_I32_I32 >; } defm V_MUL_HI_I32 : VOP3Inst , "v_mul_hi_i32", VOP_I32_I32_I32, mulhs >; } // End isCommutable = 1, SchedRW = 
[WriteQuarterRate32] let SchedRW = [WriteFloatFMA, WriteSALU] in { defm V_DIV_SCALE_F32 : VOP3bInst , "v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1 >; } let SchedRW = [WriteDouble, WriteSALU] in { // Double precision division pre-scale. defm V_DIV_SCALE_F64 : VOP3bInst , "v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1 >; } // End SchedRW = [WriteDouble] let isCommutable = 1, Uses = [VCC, EXEC] in { let SchedRW = [WriteFloatFMA] in { // v_div_fmas_f32: // result = src0 * src1 + src2 // if (vcc) // result *= 2^32 // defm V_DIV_FMAS_F32 : VOP3_VCC_Inst , "v_div_fmas_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; } let SchedRW = [WriteDouble] in { // v_div_fmas_f64: // result = src0 * src1 + src2 // if (vcc) // result *= 2^64 // defm V_DIV_FMAS_F64 : VOP3_VCC_Inst , "v_div_fmas_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fmas >; } // End SchedRW = [WriteDouble] } // End isCommutable = 1, Uses = [VCC, EXEC] //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; let SchedRW = [WriteDouble] in { defm V_TRIG_PREOP_F64 : VOP3Inst < vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop >; } // End SchedRW = [WriteDouble] // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { defm V_LSHL_B64 : VOP3Inst , "v_lshl_b64", VOP_I64_I64_I32>; defm V_LSHR_B64 : VOP3Inst , "v_lshr_b64", VOP_I64_I64_I32>; defm V_ASHR_I64 : VOP3Inst , "v_ashr_i64", VOP_I64_I64_I32>; defm V_MULLIT_F32 : VOP3Inst , "v_mullit_f32", VOP_F32_F32_F32_F32>; } // End SubtargetPredicate = isSICI let SubtargetPredicate = isVI, DisableSIDecoder = 1 in { defm V_LSHLREV_B64 : VOP3Inst , "v_lshlrev_b64", VOP_I64_I32_I64 >; defm V_LSHRREV_B64 : VOP3Inst , "v_lshrrev_b64", VOP_I64_I32_I64 >; defm V_ASHRREV_I64 : VOP3Inst , "v_ashrrev_i64", VOP_I64_I32_I64 >; } // End SubtargetPredicate = isVI //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []> { let isPseudo = 1; let isCodeGenOnly = 1; } // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. def V_MOV_B64_PSEUDO : PseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_64:$src0)> { let VALU = 1; } } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] let usesCustomInserter = 1, SALU = 1 in { def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; } // End let usesCustomInserter = 1, SALU = 1 // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. let hasSideEffects = 1 in { // Dummy terminator instruction to use after control flow instructions // replaced with exec mask operations. 
def SI_MASK_BRANCH : PseudoInstSI < (outs), (ins brtarget:$target, SReg_64:$dst)> { let isBranch = 1; let isTerminator = 1; let isBarrier = 1; let SALU = 1; } let Uses = [EXEC], Defs = [EXEC, SCC] in { let isBranch = 1, isTerminator = 1 in { def SI_IF: PseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))]> { let Constraints = ""; } def SI_ELSE : PseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> { let Constraints = "$src = $dst"; } def SI_LOOP : PseudoInstSI < (outs), (ins SReg_64:$saved, brtarget:$target), [(int_amdgcn_loop i64:$saved, bb:$target)] >; } // End isBranch = 1, isTerminator = 1 def SI_BREAK : PseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$src), [(set i64:$dst, (int_amdgcn_break i64:$src))] >; def SI_IF_BREAK : PseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))] >; def SI_ELSE_BREAK : PseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))] >; def SI_END_CF : PseudoInstSI < (outs), (ins SReg_64:$saved), [(int_amdgcn_end_cf i64:$saved)] >; } // End Uses = [EXEC], Defs = [EXEC, SCC] let Uses = [EXEC], Defs = [EXEC,VCC] in { def SI_KILL : PseudoInstSI < (outs), (ins VSrc_32:$src), [(int_AMDGPU_kill f32:$src)]> { let isConvergent = 1; let usesCustomInserter = 1; } def SI_KILL_TERMINATOR : PseudoInstSI < (outs), (ins VSrc_32:$src)> { let isTerminator = 1; } } // End Uses = [EXEC], Defs = [EXEC,VCC] } // End mayLoad = 1, mayStore = 1, hasSideEffects = 1 def SI_PS_LIVE : PseudoInstSI < (outs SReg_64:$dst), (ins), [(set i1:$dst, (int_amdgcn_ps_live))]> { let SALU = 1; } // Used as an isel pseudo to directly emit initialization with an // s_mov_b32 rather than a copy of another initialized // register. MachineCSE skips copies, and we don't want to have to // fold operands before it runs. def SI_INIT_M0 : PseudoInstSI <(outs), (ins SSrc_32:$src)> { let Defs = [M0]; let usesCustomInserter = 1; let isAsCheapAsAMove = 1; let SALU = 1; let isReMaterializable = 1; } def SI_RETURN : PseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn)]> { let isTerminator = 1; let isBarrier = 1; let isReturn = 1; let hasSideEffects = 1; let SALU = 1; let hasNoSchedulingInfo = 1; + let DisableWQM = 1; } let Uses = [EXEC], Defs = [EXEC, VCC, M0], UseNamedOperandTable = 1 in { class SI_INDIRECT_SRC : PseudoInstSI < (outs VGPR_32:$vdst, SReg_64:$sdst), (ins rc:$src, VS_32:$idx, i32imm:$offset)>; class SI_INDIRECT_DST : PseudoInstSI < (outs rc:$vdst, SReg_64:$sdst), (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> { let Constraints = "$src = $vdst"; } // TODO: We can support indirect SGPR access. 
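// Editorial note: SI_INDIRECT_SRC reads element ($idx + $offset) out of the
// source register tuple and SI_INDIRECT_DST overwrites that element with
// $val; the later expansion is based on the M0-relative v_movrels_b32 /
// v_movreld_b32 moves defined earlier.  A minimal C++ sketch of the intended
// per-lane semantics (illustrative only):
//
//   float indirect_src(const float *vec, unsigned idx, unsigned offset) {
//     return vec[idx + offset];      // SI_INDIRECT_SRC_V*
//   }
//   void indirect_dst(float *vec, unsigned idx, unsigned offset, float val) {
//     vec[idx + offset] = val;       // SI_INDIRECT_DST_V*
//   }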
def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC; def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC; def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST; def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST; } // End Uses = [EXEC], Defs = [EXEC,VCC,M0] multiclass SI_SPILL_SGPR { let UseNamedOperandTable = 1, Uses = [EXEC] in { def _SAVE : PseudoInstSI < (outs), (ins sgpr_class:$src, i32imm:$frame_idx)> { let mayStore = 1; let mayLoad = 0; } def _RESTORE : PseudoInstSI < (outs sgpr_class:$dst), (ins i32imm:$frame_idx)> { let mayStore = 0; let mayLoad = 1; } } // End UseNamedOperandTable = 1 } // It's unclear whether you can use M0 as the output of v_readlane_b32 // instructions, so use SReg_32_XM0 register class for spills to prevent // this from happening. defm SI_SPILL_S32 : SI_SPILL_SGPR ; defm SI_SPILL_S64 : SI_SPILL_SGPR ; defm SI_SPILL_S128 : SI_SPILL_SGPR ; defm SI_SPILL_S256 : SI_SPILL_SGPR ; defm SI_SPILL_S512 : SI_SPILL_SGPR ; multiclass SI_SPILL_VGPR { let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in { def _SAVE : PseudoInstSI < (outs), (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset, i32imm:$offset)> { let mayStore = 1; let mayLoad = 0; } def _RESTORE : PseudoInstSI < (outs vgpr_class:$dst), (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset, i32imm:$offset)> { let mayStore = 0; let mayLoad = 1; } } // End UseNamedOperandTable = 1, VGPRSpill = 1 } defm SI_SPILL_V32 : SI_SPILL_VGPR ; defm SI_SPILL_V64 : SI_SPILL_VGPR ; defm SI_SPILL_V96 : SI_SPILL_VGPR ; defm SI_SPILL_V128 : SI_SPILL_VGPR ; defm SI_SPILL_V256 : SI_SPILL_VGPR ; defm SI_SPILL_V512 : SI_SPILL_VGPR ; let Defs = [SCC] in { def SI_PC_ADD_REL_OFFSET : PseudoInstSI < (outs SReg_64:$dst), (ins si_ga:$ptr), [(set SReg_64:$dst, (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr))))]> { let SALU = 1; } } // End Defs = [SCC] } // End SubtargetPredicate = isGCN let Predicates = [isGCN] in { def : Pat < (int_AMDGPU_kilp), (SI_KILL 0xbf800000) >; /* int_SI_vs_load_input */ def : Pat< (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) >; def : Pat < (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, f32:$src0, f32:$src1, f32:$src2, f32:$src3), (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, $src0, $src1, $src2, $src3) >; //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// multiclass MUBUF_LoadIntrinsicPat { def : Pat< (vt (name v4i32:$rsrc, 0, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc)), (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< (vt (name v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc)), (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< (vt (name v4i32:$rsrc, 0, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc)), (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< (vt 
(name v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; } defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; defm : MUBUF_LoadIntrinsicPat; multiclass MUBUF_StoreIntrinsicPat { def : Pat< (name vt:$vdata, v4i32:$rsrc, 0, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc), (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc), (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< (name vt:$vdata, v4i32:$rsrc, 0, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc), (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; def : Pat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc), (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0) >; } defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; defm : MUBUF_StoreIntrinsicPat; //===----------------------------------------------------------------------===// // buffer_atomic patterns //===----------------------------------------------------------------------===// multiclass BufferAtomicPatterns { def : Pat< (name i32:$vdata_in, v4i32:$rsrc, 0, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$slc), (!cast(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) >; def : Pat< (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$slc), (!cast(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) >; def : Pat< (name i32:$vdata_in, v4i32:$rsrc, 0, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$slc), (!cast(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) >; def : Pat< (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$slc), (!cast(opcode # _RTN_BOTHEN) $vdata_in, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) >; } defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; def : Pat< (int_amdgcn_buffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$slc), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), sub0) >; def : Pat< 
(int_amdgcn_buffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$slc), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), sub0) >; def : Pat< (int_amdgcn_buffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$slc), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), sub0) >; def : Pat< (int_amdgcn_buffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$slc), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), sub0) >; //===----------------------------------------------------------------------===// // S_GETREG_B32 Intrinsic Pattern. //===----------------------------------------------------------------------===// def : Pat < (int_amdgcn_s_getreg imm:$simm16), (S_GETREG_B32 (as_i16imm $simm16)) >; //===----------------------------------------------------------------------===// // DS_SWIZZLE Intrinsic Pattern. //===----------------------------------------------------------------------===// def : Pat < (int_amdgcn_ds_swizzle i32:$src, imm:$offset16), (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) >; //===----------------------------------------------------------------------===// // SMRD Patterns //===----------------------------------------------------------------------===// multiclass SMRD_Pattern { // 1. IMM offset def : Pat < (smrd_load (SMRDImm i64:$sbase, i32:$offset)), (vt (!cast(Instr#"_IMM") $sbase, $offset)) >; // 2. SGPR offset def : Pat < (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), (vt (!cast(Instr#"_SGPR") $sbase, $offset)) >; def : Pat < (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), (vt (!cast(Instr#"_IMM_ci") $sbase, $offset)) > { let Predicates = [isCIOnly]; } } // Global and constant loads can be selected to either MUBUF or SMRD // instructions, but SMRD instructions are faster so we want the instruction // selector to prefer those. let AddedComplexity = 100 in { defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; // 1. Offset as an immediate def : Pat < (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)), (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset) >; // 2. 
Offset loaded in an 32bit SGPR def : Pat < (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)), (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset) >; let Predicates = [isCI] in { def : Pat < (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)), (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset) >; } // End Predicates = [isCI] } // End let AddedComplexity = 10000 //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// def : Pat < (i64 (ctpop i64:$src)), (i64 (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, (S_MOV_B32 0), sub1)) >; def : Pat < (i32 (smax i32:$x, (i32 (ineg i32:$x)))), (S_ABS_I32 $x) >; //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// // V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector // case, the sgpr-copies pass will fix this to use the vector version. def : Pat < (i32 (addc i32:$src0, i32:$src1)), (S_ADD_U32 $src0, $src1) >; //===----------------------------------------------------------------------===// // SOPP Patterns //===----------------------------------------------------------------------===// def : Pat < (int_amdgcn_s_waitcnt i32:$simm16), (S_WAITCNT (as_i16imm $simm16)) >; //===----------------------------------------------------------------------===// // VOP1 Patterns //===----------------------------------------------------------------------===// let Predicates = [UnsafeFPMath] in { //def : RcpPat; //defm : RsqPat; //defm : RsqPat; def : RsqPat; def : RsqPat; // Convert (x - floor(x)) to fract(x) def : Pat < (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) >; // Convert (x + (-floor(x))) to fract(x) def : Pat < (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) >; } // End Predicates = [UnsafeFPMath] //===----------------------------------------------------------------------===// // VOP2 Patterns //===----------------------------------------------------------------------===// def : Pat < (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), (V_BCNT_U32_B32_e64 $popcnt, $val) >; def : Pat < (i32 (select i1:$src0, i32:$src1, i32:$src2)), (V_CNDMASK_B32_e64 $src2, $src1, $src0) >; // Pattern for V_MAC_F32 def : Pat < (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), (VOP3NoMods f32:$src1, i32:$src1_modifiers), (VOP3NoMods f32:$src2, i32:$src2_modifiers)), (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, $clamp, $omod) >; /********** ======================= **********/ /********** Image sampling patterns **********/ /********** ======================= **********/ // Image + sampler class SampleRawPattern : Pat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), (opcode $addr, $rsrc, $sampler, (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) >; multiclass SampleRawPatterns { def : SampleRawPattern(opcode # _V4_V1), i32>; def : SampleRawPattern(opcode # _V4_V2), v2i32>; def : SampleRawPattern(opcode # _V4_V4), 
v4i32>; def : SampleRawPattern(opcode # _V4_V8), v8i32>; def : SampleRawPattern(opcode # _V4_V16), v16i32>; } // Image only class ImagePattern : Pat < (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), (opcode $addr, $rsrc, (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) >; multiclass ImagePatterns { def : ImagePattern(opcode # _V4_V1), i32>; def : ImagePattern(opcode # _V4_V2), v2i32>; def : ImagePattern(opcode # _V4_V4), v4i32>; } class ImageLoadPattern : Pat < (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$r128, imm:$da, imm:$glc, imm:$slc), (opcode $addr, $rsrc, (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) >; multiclass ImageLoadPatterns { def : ImageLoadPattern(opcode # _V4_V1), i32>; def : ImageLoadPattern(opcode # _V4_V2), v2i32>; def : ImageLoadPattern(opcode # _V4_V4), v4i32>; } class ImageStorePattern : Pat < (name v4f32:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, imm:$r128, imm:$da, imm:$glc, imm:$slc), (opcode $data, $addr, $rsrc, (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) >; multiclass ImageStorePatterns { def : ImageStorePattern(opcode # _V4_V1), i32>; def : ImageStorePattern(opcode # _V4_V2), v2i32>; def : ImageStorePattern(opcode # _V4_V4), v4i32>; } class ImageAtomicPattern : Pat < (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) >; multiclass ImageAtomicPatterns { def : ImageAtomicPattern(opcode # _V1), i32>; def : ImageAtomicPattern(opcode # _V2), v2i32>; def : ImageAtomicPattern(opcode # _V4), v4i32>; } class ImageAtomicCmpSwapPattern : Pat < (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), (EXTRACT_SUBREG (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1), $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)), sub0) >; // Basic sample defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; // Sample with comparison defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; // Sample with offsets defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; // Sample with comparison and offsets defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; defm : SampleRawPatterns; // Gather opcodes // Only the variants which make sense are defined. 
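// (Presumably because gather4 only operates on 2D, cube and array resources,
// the single-dword address forms are omitted and each gather opcode below is
// only patterned with the address vector sizes it can actually take.)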
def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : SampleRawPattern; def : ImagePattern; defm : ImagePatterns; defm : ImagePatterns; defm : ImageLoadPatterns; defm : ImageLoadPatterns; defm : ImageStorePatterns; defm : ImageStorePatterns; defm : ImageAtomicPatterns; def : ImageAtomicCmpSwapPattern; def : ImageAtomicCmpSwapPattern; def : ImageAtomicCmpSwapPattern; defm : ImageAtomicPatterns; defm : ImageAtomicPatterns; defm : ImageAtomicPatterns; defm : ImageAtomicPatterns; defm : ImageAtomicPatterns; defm : ImageAtomicPatterns; defm : ImageAtomicPatterns; defm : ImageAtomicPatterns; defm : ImageAtomicPatterns; defm : ImageAtomicPatterns; defm : ImageAtomicPatterns; /* SIsample for simple 1D texture lookup */ def : Pat < (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm), (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; class SamplePattern : Pat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm), (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; class SampleRectPattern : Pat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT), (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0) >; class SampleArrayPattern : Pat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY), (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) >; class SampleShadowPattern : Pat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW), (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; class SampleShadowArrayPattern : Pat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) >; /* SIsample* for texture lookups consuming more address parameters */ multiclass SamplePatterns { def : SamplePattern ; def : SampleRectPattern ; def : SampleArrayPattern ; def : SampleShadowPattern ; def : SampleShadowArrayPattern ; def : SamplePattern ; def : SampleArrayPattern ; def : SampleShadowPattern ; def : SampleShadowArrayPattern ; def : SamplePattern ; def : SampleArrayPattern ; def : SampleShadowPattern ; def : SampleShadowArrayPattern ; def : SamplePattern ; def : SampleArrayPattern ; def : SampleShadowPattern ; def : SampleShadowArrayPattern ; } defm : SamplePatterns; defm : SamplePatterns; defm : SamplePatterns; defm : SamplePatterns; /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ foreach Index = 0-2 in { def Extract_Element_v2i32_#Index : Extract_Element < i32, v2i32, Index, !cast(sub#Index) >; def Insert_Element_v2i32_#Index : Insert_Element < i32, v2i32, Index, !cast(sub#Index) >; def 
Extract_Element_v2f32_#Index : Extract_Element < f32, v2f32, Index, !cast(sub#Index) >; def Insert_Element_v2f32_#Index : Insert_Element < f32, v2f32, Index, !cast(sub#Index) >; } foreach Index = 0-3 in { def Extract_Element_v4i32_#Index : Extract_Element < i32, v4i32, Index, !cast(sub#Index) >; def Insert_Element_v4i32_#Index : Insert_Element < i32, v4i32, Index, !cast(sub#Index) >; def Extract_Element_v4f32_#Index : Extract_Element < f32, v4f32, Index, !cast(sub#Index) >; def Insert_Element_v4f32_#Index : Insert_Element < f32, v4f32, Index, !cast(sub#Index) >; } foreach Index = 0-7 in { def Extract_Element_v8i32_#Index : Extract_Element < i32, v8i32, Index, !cast(sub#Index) >; def Insert_Element_v8i32_#Index : Insert_Element < i32, v8i32, Index, !cast(sub#Index) >; def Extract_Element_v8f32_#Index : Extract_Element < f32, v8f32, Index, !cast(sub#Index) >; def Insert_Element_v8f32_#Index : Insert_Element < f32, v8f32, Index, !cast(sub#Index) >; } foreach Index = 0-15 in { def Extract_Element_v16i32_#Index : Extract_Element < i32, v16i32, Index, !cast(sub#Index) >; def Insert_Element_v16i32_#Index : Insert_Element < i32, v16i32, Index, !cast(sub#Index) >; def Extract_Element_v16f32_#Index : Extract_Element < f32, v16f32, Index, !cast(sub#Index) >; def Insert_Element_v16f32_#Index : Insert_Element < f32, v16f32, Index, !cast(sub#Index) >; } // FIXME: Why do only some of these type combinations for SReg and // VReg? // 32-bit bitcast def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; // 64-bit bitcast def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; // 128-bit bitcast def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; // 256-bit bitcast def : BitConvert ; def : BitConvert ; def : BitConvert ; def : BitConvert ; // 512-bit bitcast def : BitConvert ; def : BitConvert ; /********** =================== **********/ /********** Src & Dst modifiers **********/ /********** =================== **********/ def : Pat < (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), (f32 FP_ZERO), (f32 FP_ONE)), (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) >; /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ // Prevent expanding both fneg and fabs. def : Pat < (fneg (fabs f32:$src)), (S_OR_B32 $src, 0x80000000) // Set sign bit >; // FIXME: Should use S_OR_B32 def : Pat < (fneg (fabs f64:$src)), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), (V_MOV_B32_e32 0x80000000)), // Set sign bit. sub1) >; def : Pat < (fabs f32:$src), (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) >; def : Pat < (fneg f32:$src), (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) >; def : Pat < (fabs f64:$src), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. 
sub1) >; def : Pat < (fneg f64:$src), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), (V_MOV_B32_e32 0x80000000)), sub1) >; /********** ================== **********/ /********** Immediate Patterns **********/ /********** ================== **********/ def : Pat < (SGPRImm<(i32 imm)>:$imm), (S_MOV_B32 imm:$imm) >; def : Pat < (SGPRImm<(f32 fpimm)>:$imm), (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) >; def : Pat < (i32 imm:$imm), (V_MOV_B32_e32 imm:$imm) >; def : Pat < (f32 fpimm:$imm), (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) >; def : Pat < (i64 InlineImm:$imm), (S_MOV_B64 InlineImm:$imm) >; // XXX - Should this use a s_cmp to set SCC? // Set to sign-extended 64-bit value (true = -1, false = 0) def : Pat < (i1 imm:$imm), (S_MOV_B64 (i64 (as_i64imm $imm))) >; def : Pat < (f64 InlineFPImm:$imm), (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm:$imm))) >; /********** ================== **********/ /********** Intrinsic Patterns **********/ /********** ================== **********/ def : POW_Common ; def : Pat < (int_AMDGPU_cube v4f32:$src), (REG_SEQUENCE VReg_128, (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), 0 /* clamp */, 0 /* omod */), sub0, (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), 0 /* clamp */, 0 /* omod */), sub1, (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), 0 /* clamp */, 0 /* omod */), sub2, (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), 0 /* clamp */, 0 /* omod */), sub3) >; def : Pat < (i32 (sext i1:$src0)), (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) >; class Ext32Pat : Pat < (i32 (ext i1:$src0)), (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0) >; def : Ext32Pat ; def : Ext32Pat ; // Offset in an 32-bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) >; // The multiplication scales from [0,1] to the unsigned integer range def : Pat < (AMDGPUurecip i32:$src0), (V_CVT_U32_F32_e32 (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) >; //===----------------------------------------------------------------------===// // VOP3 Patterns //===----------------------------------------------------------------------===// def : IMad24Pat; def : UMad24Pat; defm : BFIPatterns ; def : ROTRPattern ; /********** ======================= **********/ /********** Load/Store Patterns **********/ /********** ======================= **********/ class DSReadPat : Pat < (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (inst $ptr, (as_i16imm $offset), (i1 0)) >; def : DSReadPat ; def : DSReadPat ; def : DSReadPat ; def : DSReadPat ; def : DSReadPat ; let AddedComplexity = 100 in { def : DSReadPat ; } // End AddedComplexity = 100 def : Pat < (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))), (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) >; class DSWritePat : Pat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), (inst $ptr, $value, (as_i16imm $offset), (i1 0)) >; def : DSWritePat ; def : 
DSWritePat ; def : DSWritePat ; let AddedComplexity = 100 in { def : DSWritePat ; } // End AddedComplexity = 100 def : Pat < (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)), (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, (i1 0)) >; class DSAtomicRetPat : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), (inst $ptr, $value, (as_i16imm $offset), (i1 0)) >; class DSAtomicCmpXChg : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) >; // 32-bit atomics. def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicCmpXChg; // 64-bit atomics. def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicRetPat; def : DSAtomicCmpXChg; //===----------------------------------------------------------------------===// // MUBUF Patterns //===----------------------------------------------------------------------===// class MUBUFLoad_Pattern : Pat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) >; multiclass MUBUFLoad_Atomic_Pattern { def : Pat < (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc))), (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) >; def : Pat < (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) >; } let Predicates = [isSICI] in { def : MUBUFLoad_Pattern ; def : MUBUFLoad_Pattern ; def : MUBUFLoad_Pattern ; def : MUBUFLoad_Pattern ; defm : MUBUFLoad_Atomic_Pattern ; defm : MUBUFLoad_Atomic_Pattern ; } // End Predicates = [isSICI] class MUBUFScratchLoadPat : Pat < (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset))), (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) >; def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; def : MUBUFScratchLoadPat ; // BUFFER_LOAD_DWORD*, addr64=0 multiclass MUBUF_Load_Dword { def : Pat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, imm:$offset, 0, 0, imm:$glc, imm:$slc, imm:$tfe)), (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; def : Pat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 1, 0, imm:$glc, imm:$slc, imm:$tfe)), (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; def : Pat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 0, 1, imm:$glc, imm:$slc, imm:$tfe)), (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; def : Pat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, imm:$offset, 1, 1, imm:$glc, imm:$slc, imm:$tfe)), (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), 
(as_i1imm $glc), (as_i1imm $slc), (as_i1imm $tfe)) >; } defm : MUBUF_Load_Dword ; defm : MUBUF_Load_Dword ; defm : MUBUF_Load_Dword ; multiclass MUBUFStore_Atomic_Pattern { // Store follows atomic op convention so address is forst def : Pat < (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$val), (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) >; def : Pat < (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) >; } let Predicates = [isSICI] in { defm : MUBUFStore_Atomic_Pattern ; defm : MUBUFStore_Atomic_Pattern ; } // End Predicates = [isSICI] class MUBUFScratchStorePat : Pat < (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) >; def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; def : MUBUFScratchStorePat ; //===----------------------------------------------------------------------===// // MTBUF Patterns //===----------------------------------------------------------------------===// // TBUFFER_STORE_FORMAT_*, addr64=0 class MTBUF_StoreResource : Pat< (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, i32:$soffset, imm:$inst_offset, imm:$dfmt, imm:$nfmt, imm:$offen, imm:$idxen, imm:$glc, imm:$slc, imm:$tfe), (opcode $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, (as_i1imm $slc), (as_i1imm $tfe), $soffset) >; def : MTBUF_StoreResource ; def : MTBUF_StoreResource ; def : MTBUF_StoreResource ; def : MTBUF_StoreResource ; /********** ====================== **********/ /********** Indirect adressing **********/ /********** ====================== **********/ multiclass SI_INDIRECT_Pattern { // Extract with offset def : Pat< (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), (!cast("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) >; // Insert with offset def : Pat< (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), (!cast("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) >; } defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; defm : SI_INDIRECT_Pattern ; //===----------------------------------------------------------------------===// // Conversion Patterns //===----------------------------------------------------------------------===// def : Pat<(i32 (sext_inreg i32:$src, i1)), (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 // Handle sext_inreg in i64 def : Pat < (i64 (sext_inreg i64:$src, i1)), (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i8)), (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i16)), (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i32)), (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16 >; class ZExt_i64_i32_Pat : Pat < (i64 (ext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1) >; class ZExt_i64_i1_Pat : Pat < (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, (S_MOV_B32 0), sub1) >; def : ZExt_i64_i32_Pat; def : ZExt_i64_i32_Pat; def : ZExt_i64_i1_Pat; def : 
ZExt_i64_i1_Pat; // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that // REG_SEQUENCE patterns don't support instructions with multiple outputs. def : Pat < (i64 (sext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1) >; def : Pat < (i64 (sext i1:$src)), (REG_SEQUENCE VReg_64, (V_CNDMASK_B32_e64 0, -1, $src), sub0, (V_CNDMASK_B32_e64 0, -1, $src), sub1) >; class FPToI1Pat : Pat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) >; def : FPToI1Pat; def : FPToI1Pat; def : FPToI1Pat; def : FPToI1Pat; // If we need to perform a logical operation on i1 values, we need to // use vector comparisons since there is only one SCC register. Vector // comparisions still write to a pair of SGPRs, so treat these as // 64-bit comparisons. When legalizing SGPR copies, instructions // resulting in the copies from SCC to these instructions will be // moved to the VALU. def : Pat < (i1 (and i1:$src0, i1:$src1)), (S_AND_B64 $src0, $src1) >; def : Pat < (i1 (or i1:$src0, i1:$src1)), (S_OR_B64 $src0, $src1) >; def : Pat < (i1 (xor i1:$src0, i1:$src1)), (S_XOR_B64 $src0, $src1) >; def : Pat < (f32 (sint_to_fp i1:$src)), (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) >; def : Pat < (f32 (uint_to_fp i1:$src)), (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) >; def : Pat < (f64 (sint_to_fp i1:$src)), (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) >; def : Pat < (f64 (uint_to_fp i1:$src)), (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) >; //===----------------------------------------------------------------------===// // Miscellaneous Patterns //===----------------------------------------------------------------------===// def : Pat < (i32 (trunc i64:$a)), (EXTRACT_SUBREG $a, sub0) >; def : Pat < (i1 (trunc i32:$a)), (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), $a), 1) >; def : Pat < (i1 (trunc i64:$a)), (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), (EXTRACT_SUBREG $a, sub0)), 1) >; def : Pat < (i32 (bswap i32:$a)), (V_BFI_B32 (S_MOV_B32 0x00ff00ff), (V_ALIGNBIT_B32 $a, $a, 24), (V_ALIGNBIT_B32 $a, $a, 8)) >; def : Pat < (f32 (select i1:$src2, f32:$src1, f32:$src0)), (V_CNDMASK_B32_e64 $src0, $src1, $src2) >; multiclass BFMPatterns { def : Pat < (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), (BFM $a, $b) >; def : Pat < (vt (add (vt (shl 1, vt:$a)), -1)), (BFM $a, (MOV 0)) >; } defm : BFMPatterns ; // FIXME: defm : BFMPatterns ; def : BFEPattern ; let Predicates = [isSICI] in { def : Pat < (i64 (readcyclecounter)), (S_MEMTIME) >; } def : Pat< (fcanonicalize f32:$src), (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0) >; def : Pat< (fcanonicalize f64:$src), (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0) >; //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// let Predicates = [isSI] in { // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is // used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient // way to implement it is using V_FRACT_F64. // The workaround for the V_FRACT bug is: // fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) // Convert floor(x) to (x - fract(x)) def : Pat < (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), (V_ADD_F64 $mods, $x, SRCMODS.NEG, (V_CNDMASK_B64_PSEUDO (V_MIN_F64 SRCMODS.NONE, (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), SRCMODS.NONE, (V_MOV_B64_PSEUDO 0x3fefffffffffffff), DSTCLAMP.NONE, DSTOMOD.NONE), $x, (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), DSTCLAMP.NONE, DSTOMOD.NONE) >; } // End Predicates = [isSI] //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// def : SHA256MaPattern ; def : IntMed3Pat; def : IntMed3Pat; //============================================================================// // Assembler aliases //============================================================================// def : MnemonicAlias<"v_add_u32", "v_add_i32">; def : MnemonicAlias<"v_sub_u32", "v_sub_i32">; def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">; } // End isGCN predicate Index: projects/clang391-import/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp =================================================================== --- projects/clang391-import/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (revision 309436) +++ projects/clang391-import/contrib/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (revision 309437) @@ -1,537 +1,530 @@ //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // /// \file /// \brief This pass adds instructions to enable whole quad mode for pixel /// shaders. /// /// Whole quad mode is required for derivative computations, but it interferes /// with shader side effects (stores and atomics). This pass is run on the /// scheduled machine IR but before register coalescing, so that machine SSA is /// available for analysis. It ensures that WQM is enabled when necessary, but /// disabled around stores and atomics. /// /// When necessary, this pass creates a function prolog /// /// S_MOV_B64 LiveMask, EXEC /// S_WQM_B64 EXEC, EXEC /// /// to enter WQM at the top of the function and surrounds blocks of Exact /// instructions by /// /// S_AND_SAVEEXEC_B64 Tmp, LiveMask /// ... /// S_MOV_B64 EXEC, Tmp /// /// In order to avoid excessive switching during sequences of Exact /// instructions, the pass first analyzes which instructions must be run in WQM /// (aka which instructions produce values that lead to derivative /// computations). /// /// Basic blocks are always exited in WQM as long as some successor needs WQM. /// /// There is room for improvement given better control flow analysis: /// /// (1) at the top level (outside of control flow statements, and as long as /// kill hasn't been used), one SGPR can be saved by recovering WQM from /// the LiveMask (this is implemented for the entry block). /// /// (2) when entire regions (e.g. if-else blocks or entire loops) only /// consist of exact and don't-care instructions, the switch only has to /// be done at the entry and exit points rather than potentially in each /// block of the region. 
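///
/// For illustration only (names are schematic, not literal pass output), a
/// pixel shader that samples a texture and then performs a buffer store may
/// end up bracketed like this:
///
///   S_MOV_B64 LiveMask, EXEC          ; prolog: remember the live lanes
///   S_WQM_B64 EXEC, EXEC              ; enter WQM for derivative computations
///   IMAGE_SAMPLE ...                  ; needs WQM
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask  ; drop helper lanes before side effects
///   BUFFER_STORE_DWORD ...            ; Exact instruction
///   S_MOV_B64 EXEC, Tmp               ; restore WQM if more WQM code follows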
/// //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" using namespace llvm; #define DEBUG_TYPE "si-wqm" namespace { enum { StateWQM = 0x1, StateExact = 0x2, }; struct InstrInfo { char Needs = 0; char OutNeeds = 0; }; struct BlockInfo { char Needs = 0; char InNeeds = 0; char OutNeeds = 0; }; struct WorkItem { MachineBasicBlock *MBB = nullptr; MachineInstr *MI = nullptr; WorkItem() {} WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {} WorkItem(MachineInstr *MI) : MI(MI) {} }; class SIWholeQuadMode : public MachineFunctionPass { private: const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; LiveIntervals *LIS; DenseMap Instructions; DenseMap Blocks; SmallVector ExecExports; SmallVector LiveMaskQueries; void markInstruction(MachineInstr &MI, char Flag, std::vector &Worklist); char scanInstructions(MachineFunction &MF, std::vector &Worklist); void propagateInstruction(MachineInstr &MI, std::vector &Worklist); void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); char analyzeFunction(MachineFunction &MF); void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveWQM, unsigned LiveMaskReg); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SavedWQM); void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); void lowerLiveMaskQueries(unsigned LiveMaskReg); public: static char ID; SIWholeQuadMode() : MachineFunctionPass(ID) { } bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { return "SI Whole Quad Mode"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } }; } // End anonymous namespace char SIWholeQuadMode::ID = 0; INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; FunctionPass *llvm::createSIWholeQuadModePass() { return new SIWholeQuadMode; } void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, std::vector &Worklist) { InstrInfo &II = Instructions[&MI]; assert(Flag == StateWQM || Flag == StateExact); // Ignore if the instruction is already marked. The typical case is that we // mark an instruction WQM multiple times, but for atomics it can happen that // Flag is StateWQM, but Needs is already set to StateExact. In this case, // letting the atomic run in StateExact is correct as per the relevant specs. if (II.Needs) return; II.Needs = Flag; Worklist.push_back(&MI); } // Scan instructions to determine which ones require an Exact execmask and // which ones seed WQM requirements. 
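// In rough pseudocode (a simplification of the scan below, which also handles
// exports, SI_PS_LIVE and the amdgpu-ps-wqm-outputs attribute): an instruction
// seeds StateWQM when it is a WQM or DS instruction, and seeds StateExact when
// it is marked DisableWQM (stores and atomics; this change also sets the flag
// on SI_RETURN). Everything else is neutral and only inherits needs from its
// users:
//
//   char classify(const MachineInstr &MI) {
//     if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode()))
//       return StateWQM;
//     if (TII->isDisableWQM(MI))
//       return StateExact;
//     return 0; // neutral
//   }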
char SIWholeQuadMode::scanInstructions(MachineFunction &MF, std::vector &Worklist) { char GlobalFlags = 0; bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs"); for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { MachineInstr &MI = *II; unsigned Opcode = MI.getOpcode(); char Flags = 0; if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { Flags = StateWQM; } else if (TII->isDisableWQM(MI)) { Flags = StateExact; } else { // Handle export instructions with the exec mask valid flag set if (Opcode == AMDGPU::EXP) { if (MI.getOperand(4).getImm() != 0) ExecExports.push_back(&MI); } else if (Opcode == AMDGPU::SI_PS_LIVE) { LiveMaskQueries.push_back(&MI); } else if (WQMOutputs) { // The function is in machine SSA form, which means that physical // VGPRs correspond to shader inputs and outputs. Inputs are // only used, outputs are only defined. for (const MachineOperand &MO : MI.defs()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!TRI->isVirtualRegister(Reg) && TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) { Flags = StateWQM; break; } } } if (!Flags) continue; } markInstruction(MI, Flags, Worklist); GlobalFlags |= Flags; } - - if (WQMOutputs && MBB.succ_empty()) { - // This is a prolog shader. Make sure we go back to exact mode at the end. - Blocks[&MBB].OutNeeds = StateExact; - Worklist.push_back(&MBB); - GlobalFlags |= StateExact; - } } return GlobalFlags; } void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, std::vector& Worklist) { MachineBasicBlock *MBB = MI.getParent(); InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references BlockInfo &BI = Blocks[MBB]; // Control flow-type instructions and stores to temporary memory that are // followed by WQM computations must themselves be in WQM. if ((II.OutNeeds & StateWQM) && !II.Needs && (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { Instructions[&MI].Needs = StateWQM; II.Needs = StateWQM; } // Propagate to block level BI.Needs |= II.Needs; if ((BI.InNeeds | II.Needs) != BI.InNeeds) { BI.InNeeds |= II.Needs; Worklist.push_back(MBB); } // Propagate backwards within block if (MachineInstr *PrevMI = MI.getPrevNode()) { char InNeeds = II.Needs | II.OutNeeds; if (!PrevMI->isPHI()) { InstrInfo &PrevII = Instructions[PrevMI]; if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { PrevII.OutNeeds |= InNeeds; Worklist.push_back(PrevMI); } } } // Propagate WQM flag to instruction inputs assert(II.Needs != (StateWQM | StateExact)); if (II.Needs != StateWQM) return; for (const MachineOperand &Use : MI.uses()) { if (!Use.isReg() || !Use.isUse()) continue; unsigned Reg = Use.getReg(); // Handle physical registers that we need to track; this is mostly relevant // for VCC, which can appear as the (implicit) input of a uniform branch, // e.g. when a loop counter is stored in a VGPR. if (!TargetRegisterInfo::isVirtualRegister(Reg)) { if (Reg == AMDGPU::EXEC) continue; for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) { LiveRange &LR = LIS->getRegUnit(*RegUnit); const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); if (!Value) continue; // Since we're in machine SSA, we do not need to track physical // registers across basic blocks. 
if (Value->isPHIDef()) continue; markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM, Worklist); } continue; } for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) markInstruction(DefMI, StateWQM, Worklist); } } void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, std::vector& Worklist) { BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references. // Propagate through instructions if (!MBB.empty()) { MachineInstr *LastMI = &*MBB.rbegin(); InstrInfo &LastII = Instructions[LastMI]; if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) { LastII.OutNeeds |= BI.OutNeeds; Worklist.push_back(LastMI); } } // Predecessor blocks must provide for our WQM/Exact needs. for (MachineBasicBlock *Pred : MBB.predecessors()) { BlockInfo &PredBI = Blocks[Pred]; if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds) continue; PredBI.OutNeeds |= BI.InNeeds; PredBI.InNeeds |= BI.InNeeds; Worklist.push_back(Pred); } // All successors must be prepared to accept the same set of WQM/Exact data. for (MachineBasicBlock *Succ : MBB.successors()) { BlockInfo &SuccBI = Blocks[Succ]; if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds) continue; SuccBI.InNeeds |= BI.OutNeeds; Worklist.push_back(Succ); } } char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { std::vector Worklist; char GlobalFlags = scanInstructions(MF, Worklist); while (!Worklist.empty()) { WorkItem WI = Worklist.back(); Worklist.pop_back(); if (WI.MI) propagateInstruction(*WI.MI, Worklist); else propagateBlock(*WI.MBB, Worklist); } return GlobalFlags; } void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveWQM, unsigned LiveMaskReg) { if (SaveWQM) { BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveWQM) .addReg(LiveMaskReg); } else { BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64), AMDGPU::EXEC) .addReg(AMDGPU::EXEC) .addReg(LiveMaskReg); } } void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SavedWQM) { if (SavedWQM) { BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC) .addReg(SavedWQM); } else { BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) .addReg(AMDGPU::EXEC); } } void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry) { auto BII = Blocks.find(&MBB); if (BII == Blocks.end()) return; const BlockInfo &BI = BII->second; if (!(BI.InNeeds & StateWQM)) return; // This is a non-entry block that is WQM throughout, so no need to do // anything. if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact) return; unsigned SavedWQMReg = 0; bool WQMFromExec = isEntry; char State = isEntry ? StateExact : StateWQM; auto II = MBB.getFirstNonPHI(), IE = MBB.end(); while (II != IE) { MachineInstr &MI = *II; ++II; // Skip instructions that are not affected by EXEC if (TII->isScalarUnit(MI) && !MI.isTerminator()) continue; // Generic instructions such as COPY will either disappear by register // coalescing or be lowered to SALU or VALU instructions. 
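// (The check below skips generic instructions, e.g. COPY, whose first explicit
// operand defines an SGPR, on the grounds that such instructions will not be
// lowered to EXEC-dependent VALU code.)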
if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) { if (MI.getNumExplicitOperands() >= 1) { const MachineOperand &Op = MI.getOperand(0); if (Op.isReg()) { if (TRI->isSGPRReg(*MRI, Op.getReg())) { // SGPR instructions are not affected by EXEC continue; } } } } char Needs = 0; char OutNeeds = 0; auto InstrInfoIt = Instructions.find(&MI); if (InstrInfoIt != Instructions.end()) { Needs = InstrInfoIt->second.Needs; OutNeeds = InstrInfoIt->second.OutNeeds; // Make sure to switch to Exact mode before the end of the block when // Exact and only Exact is needed further downstream. if (OutNeeds == StateExact && MI.isTerminator()) { assert(Needs == 0); Needs = StateExact; } } // State switching if (Needs && State != Needs) { if (Needs == StateExact) { assert(!SavedWQMReg); if (!WQMFromExec && (OutNeeds & StateWQM)) SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); toExact(MBB, &MI, SavedWQMReg, LiveMaskReg); } else { assert(WQMFromExec == (SavedWQMReg == 0)); toWQM(MBB, &MI, SavedWQMReg); SavedWQMReg = 0; } State = Needs; } } if ((BI.OutNeeds & StateWQM) && State != StateWQM) { assert(WQMFromExec == (SavedWQMReg == 0)); toWQM(MBB, MBB.end(), SavedWQMReg); } else if (BI.OutNeeds == StateExact && State != StateExact) { toExact(MBB, MBB.end(), 0, LiveMaskReg); } } void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); unsigned Dest = MI->getOperand(0).getReg(); BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) .addReg(LiveMaskReg); MI->eraseFromParent(); } } bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS) return false; Instructions.clear(); Blocks.clear(); ExecExports.clear(); LiveMaskQueries.clear(); const SISubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis(); char GlobalFlags = analyzeFunction(MF); if (!(GlobalFlags & StateWQM)) { lowerLiveMaskQueries(AMDGPU::EXEC); return !LiveMaskQueries.empty(); } // Store a copy of the original live mask when required unsigned LiveMaskReg = 0; { MachineBasicBlock &Entry = MF.front(); MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) .addReg(AMDGPU::EXEC); } if (GlobalFlags == StateWQM) { // For a shader that needs only WQM, we can just set it once. 
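// (S_WQM_B64 EXEC, EXEC enables every lane of each quad that has at least one
// live lane, so emitting it once in the entry block is sufficient when no
// instruction ever needs Exact mode.)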
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) .addReg(AMDGPU::EXEC); lowerLiveMaskQueries(LiveMaskReg); // EntryMI may become invalid here return true; } } lowerLiveMaskQueries(LiveMaskReg); // Handle the general case for (auto BII : Blocks) processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); return true; } Index: projects/clang391-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp =================================================================== --- projects/clang391-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp (revision 309436) +++ projects/clang391-import/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp (revision 309437) @@ -1,4715 +1,4715 @@ //===- InstCombineCompares.cpp --------------------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the visitICmp and visitFCmp functions. // //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" using namespace llvm; using namespace PatternMatch; #define DEBUG_TYPE "instcombine" // How many times is a select replaced by one of its operands? STATISTIC(NumSel, "Number of select opts"); // Initialization Routines static ConstantInt *getOne(Constant *C) { return ConstantInt::get(cast(C->getType()), 1); } static ConstantInt *ExtractElement(Constant *V, Constant *Idx) { return cast(ConstantExpr::getExtractElement(V, Idx)); } static bool HasAddOverflow(ConstantInt *Result, ConstantInt *In1, ConstantInt *In2, bool IsSigned) { if (!IsSigned) return Result->getValue().ult(In1->getValue()); if (In2->isNegative()) return Result->getValue().sgt(In1->getValue()); return Result->getValue().slt(In1->getValue()); } /// Compute Result = In1+In2, returning true if the result overflowed for this /// type. static bool AddWithOverflow(Constant *&Result, Constant *In1, Constant *In2, bool IsSigned = false) { Result = ConstantExpr::getAdd(In1, In2); if (VectorType *VTy = dyn_cast(In1->getType())) { for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) { Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i); if (HasAddOverflow(ExtractElement(Result, Idx), ExtractElement(In1, Idx), ExtractElement(In2, Idx), IsSigned)) return true; } return false; } return HasAddOverflow(cast(Result), cast(In1), cast(In2), IsSigned); } static bool HasSubOverflow(ConstantInt *Result, ConstantInt *In1, ConstantInt *In2, bool IsSigned) { if (!IsSigned) return Result->getValue().ugt(In1->getValue()); if (In2->isNegative()) return Result->getValue().slt(In1->getValue()); return Result->getValue().sgt(In1->getValue()); } /// Compute Result = In1-In2, returning true if the result overflowed for this /// type. 
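// Worked example (i8 values, for illustration): 5 - 10 wraps to 251, and since
// 251 u> 5 the unsigned check in HasSubOverflow reports overflow. With a
// negative In2, e.g. 100 - (-50), the result wraps to -106, and since
// -106 s< 100 the signed check reports overflow as well.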
static bool SubWithOverflow(Constant *&Result, Constant *In1, Constant *In2, bool IsSigned = false) { Result = ConstantExpr::getSub(In1, In2); if (VectorType *VTy = dyn_cast(In1->getType())) { for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) { Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i); if (HasSubOverflow(ExtractElement(Result, Idx), ExtractElement(In1, Idx), ExtractElement(In2, Idx), IsSigned)) return true; } return false; } return HasSubOverflow(cast(Result), cast(In1), cast(In2), IsSigned); } /// Given an icmp instruction, return true if any use of this comparison is a /// branch on sign bit comparison. static bool isBranchOnSignBitCheck(ICmpInst &I, bool isSignBit) { for (auto *U : I.users()) if (isa(U)) return isSignBit; return false; } /// Given an exploded icmp instruction, return true if the comparison only /// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the /// result of the comparison is true when the input value is signed. static bool isSignBitCheck(ICmpInst::Predicate Pred, ConstantInt *RHS, bool &TrueIfSigned) { switch (Pred) { case ICmpInst::ICMP_SLT: // True if LHS s< 0 TrueIfSigned = true; return RHS->isZero(); case ICmpInst::ICMP_SLE: // True if LHS s<= RHS and RHS == -1 TrueIfSigned = true; return RHS->isAllOnesValue(); case ICmpInst::ICMP_SGT: // True if LHS s> -1 TrueIfSigned = false; return RHS->isAllOnesValue(); case ICmpInst::ICMP_UGT: // True if LHS u> RHS and RHS == high-bit-mask - 1 TrueIfSigned = true; return RHS->isMaxValue(true); case ICmpInst::ICMP_UGE: // True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc) TrueIfSigned = true; return RHS->getValue().isSignBit(); default: return false; } } /// Returns true if the exploded icmp can be expressed as a signed comparison /// to zero and updates the predicate accordingly. /// The signedness of the comparison is preserved. static bool isSignTest(ICmpInst::Predicate &Pred, const ConstantInt *RHS) { if (!ICmpInst::isSigned(Pred)) return false; if (RHS->isZero()) return ICmpInst::isRelational(Pred); if (RHS->isOne()) { if (Pred == ICmpInst::ICMP_SLT) { Pred = ICmpInst::ICMP_SLE; return true; } } else if (RHS->isAllOnesValue()) { if (Pred == ICmpInst::ICMP_SGT) { Pred = ICmpInst::ICMP_SGE; return true; } } return false; } /// Return true if the constant is of the form 1+0+. This is the same as /// lowones(~X). static bool isHighOnes(const ConstantInt *CI) { return (~CI->getValue() + 1).isPowerOf2(); } /// Given a signed integer type and a set of known zero and one bits, compute /// the maximum and minimum values that could have the specified known zero and /// known one bits, returning them in Min/Max. static void ComputeSignedMinMaxValuesFromKnownBits(const APInt &KnownZero, const APInt &KnownOne, APInt &Min, APInt &Max) { assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() && KnownZero.getBitWidth() == Min.getBitWidth() && KnownZero.getBitWidth() == Max.getBitWidth() && "KnownZero, KnownOne and Min, Max must have equal bitwidth."); APInt UnknownBits = ~(KnownZero|KnownOne); // The minimum value is when all unknown bits are zeros, EXCEPT for the sign // bit if it is unknown. 
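// Worked example (i4, for illustration): KnownOne = 0b0010, KnownZero = 0b0100
// gives UnknownBits = 0b1001. Min starts as 0b0010 and Max as 0b1011; since
// the sign bit is unknown, Min becomes 0b1010 (-6) and Max becomes 0b0011 (3).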
Min = KnownOne; Max = KnownOne|UnknownBits; if (UnknownBits.isNegative()) { // Sign bit is unknown Min.setBit(Min.getBitWidth()-1); Max.clearBit(Max.getBitWidth()-1); } } /// Given an unsigned integer type and a set of known zero and one bits, compute /// the maximum and minimum values that could have the specified known zero and /// known one bits, returning them in Min/Max. static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero, const APInt &KnownOne, APInt &Min, APInt &Max) { assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() && KnownZero.getBitWidth() == Min.getBitWidth() && KnownZero.getBitWidth() == Max.getBitWidth() && "Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth."); APInt UnknownBits = ~(KnownZero|KnownOne); // The minimum value is when the unknown bits are all zeros. Min = KnownOne; // The maximum value is when the unknown bits are all ones. Max = KnownOne|UnknownBits; } /// This is called when we see this pattern: /// cmp pred (load (gep GV, ...)), cmpcst /// where GV is a global variable with a constant initializer. Try to simplify /// this into some simple computation that does not need the load. For example /// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3". /// /// If AndCst is non-null, then the loaded value is masked with that constant /// before doing the comparison. This handles cases like "A[i]&4 == 0". Instruction *InstCombiner:: FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, CmpInst &ICI, ConstantInt *AndCst) { Constant *Init = GV->getInitializer(); if (!isa(Init) && !isa(Init)) return nullptr; uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); if (ArrayElementCount > 1024) return nullptr; // Don't blow up on huge arrays. // There are many forms of this optimization we can handle, for now, just do // the simple index into a single-dimensional array. // // Require: GEP GV, 0, i {{, constant indices}} if (GEP->getNumOperands() < 3 || !isa(GEP->getOperand(1)) || !cast(GEP->getOperand(1))->isZero() || isa(GEP->getOperand(2))) return nullptr; // Check that indices after the variable are constants and in-range for the // type they index. Collect the indices. This is typically for arrays of // structs. SmallVector LaterIndices; Type *EltTy = Init->getType()->getArrayElementType(); for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) { ConstantInt *Idx = dyn_cast(GEP->getOperand(i)); if (!Idx) return nullptr; // Variable index. uint64_t IdxVal = Idx->getZExtValue(); if ((unsigned)IdxVal != IdxVal) return nullptr; // Too large array index. if (StructType *STy = dyn_cast(EltTy)) EltTy = STy->getElementType(IdxVal); else if (ArrayType *ATy = dyn_cast(EltTy)) { if (IdxVal >= ATy->getNumElements()) return nullptr; EltTy = ATy->getElementType(); } else { return nullptr; // Unknown type. } LaterIndices.push_back(IdxVal); } enum { Overdefined = -3, Undefined = -2 }; // Variables for our state machines. // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form // "i == 47 | i == 87", where 47 is the first index the condition is true for, // and 87 is the second (and last) index. FirstTrueElement is -2 when // undefined, otherwise set to the first true element. SecondTrueElement is // -2 when undefined, -3 when overdefined and >= 0 when that index is true. int FirstTrueElement = Undefined, SecondTrueElement = Undefined; // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the // form "i != 47 & i != 87". 
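// The ComputeSignedMinMaxValuesFromKnownBits / ComputeUnsignedMinMaxValuesFromKnownBits
// helpers above derive the tightest range consistent with bits proven zero
// (KnownZero) and proven one (KnownOne).  A minimal standalone sketch of the
// same idea over plain 32-bit masks follows; it is an illustration only, not
// the LLVM APInt code, and the helper names and example masks are made up.
#include <cassert>
#include <cstdint>

// Unsigned: the minimum sets every unknown bit to 0, the maximum sets every
// unknown bit to 1.
static void unsignedMinMax(uint32_t KnownZero, uint32_t KnownOne,
                           uint32_t &Min, uint32_t &Max) {
  uint32_t Unknown = ~(KnownZero | KnownOne);
  Min = KnownOne;
  Max = KnownOne | Unknown;
}

// Signed: same idea, except an unknown sign bit is set in the minimum and
// cleared in the maximum, because a set sign bit makes the value smaller.
static void signedMinMax(uint32_t KnownZero, uint32_t KnownOne,
                         int32_t &Min, int32_t &Max) {
  uint32_t Unknown = ~(KnownZero | KnownOne);
  uint32_t Lo = KnownOne, Hi = KnownOne | Unknown;
  if (Unknown & 0x80000000u) { // sign bit unknown
    Lo |= 0x80000000u;
    Hi &= ~0x80000000u;
  }
  Min = (int32_t)Lo; // two's-complement narrowing
  Max = (int32_t)Hi;
}

int main() {
  // Example: bit 0 known to be 1, bit 1 known to be 0, everything else unknown.
  uint32_t Min, Max;
  unsignedMinMax(/*KnownZero=*/0x2, /*KnownOne=*/0x1, Min, Max);
  assert(Min == 0x1u && Max == 0xFFFFFFFDu);
  int32_t SMin, SMax;
  signedMinMax(0x2, 0x1, SMin, SMax);
  assert(SMin == INT32_MIN + 1 && SMax == 0x7FFFFFFD);
  return 0;
}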
Same state transitions as for true elements. int FirstFalseElement = Undefined, SecondFalseElement = Undefined; /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these /// define a state machine that triggers for ranges of values that the index /// is true or false for. This triggers on things like "abbbbc"[i] == 'b'. /// This is -2 when undefined, -3 when overdefined, and otherwise the last /// index in the range (inclusive). We use -2 for undefined here because we /// use relative comparisons and don't want 0-1 to match -1. int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined; // MagicBitvector - This is a magic bitvector where we set a bit if the // comparison is true for element 'i'. If there are 64 elements or less in // the array, this will fully represent all the comparison results. uint64_t MagicBitvector = 0; // Scan the array and see if one of our patterns matches. Constant *CompareRHS = cast(ICI.getOperand(1)); for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) { Constant *Elt = Init->getAggregateElement(i); if (!Elt) return nullptr; // If this is indexing an array of structures, get the structure element. if (!LaterIndices.empty()) Elt = ConstantExpr::getExtractValue(Elt, LaterIndices); // If the element is masked, handle it. if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst); // Find out if the comparison would be true or false for the i'th element. Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt, CompareRHS, DL, TLI); // If the result is undef for this element, ignore it. if (isa(C)) { // Extend range state machines to cover this element in case there is an // undef in the middle of the range. if (TrueRangeEnd == (int)i-1) TrueRangeEnd = i; if (FalseRangeEnd == (int)i-1) FalseRangeEnd = i; continue; } // If we can't compute the result for any of the elements, we have to give // up evaluating the entire conditional. if (!isa(C)) return nullptr; // Otherwise, we know if the comparison is true or false for this element, // update our state machines. bool IsTrueForElt = !cast(C)->isZero(); // State machine for single/double/range index comparison. if (IsTrueForElt) { // Update the TrueElement state machine. if (FirstTrueElement == Undefined) FirstTrueElement = TrueRangeEnd = i; // First true element. else { // Update double-compare state machine. if (SecondTrueElement == Undefined) SecondTrueElement = i; else SecondTrueElement = Overdefined; // Update range state machine. if (TrueRangeEnd == (int)i-1) TrueRangeEnd = i; else TrueRangeEnd = Overdefined; } } else { // Update the FalseElement state machine. if (FirstFalseElement == Undefined) FirstFalseElement = FalseRangeEnd = i; // First false element. else { // Update double-compare state machine. if (SecondFalseElement == Undefined) SecondFalseElement = i; else SecondFalseElement = Overdefined; // Update range state machine. if (FalseRangeEnd == (int)i-1) FalseRangeEnd = i; else FalseRangeEnd = Overdefined; } } // If this element is in range, update our magic bitvector. if (i < 64 && IsTrueForElt) MagicBitvector |= 1ULL << i; // If all of our states become overdefined, bail out early. Since the // predicate is expensive, only check it every 8 elements. This is only // really useful for really huge arrays. if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined && SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined && FalseRangeEnd == Overdefined) return nullptr; } // Now that we've scanned the entire array, emit our new comparison(s). 
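// The scan above also fills MagicBitvector: for constant arrays of at most 64
// elements, bit i records whether the comparison holds for element i, so the
// load+compare can later be emitted as ((Magic >> i) & 1) != 0 when the
// cheaper state machines are overdefined.  Minimal standalone sketch of that
// rewrite; illustration only, not the LLVM code, and the table contents are
// made-up example data.
#include <cassert>
#include <cstdint>

static const uint8_t Table[8] = {3, 7, 7, 0, 7, 1, 7, 2}; // hypothetical constant array

// Precompute: bit i is set iff Table[i] == 7.
static uint64_t buildMagic() {
  uint64_t Magic = 0;
  for (unsigned I = 0; I != 8; ++I)
    if (Table[I] == 7)
      Magic |= 1ULL << I;
  return Magic;
}

int main() {
  const uint64_t Magic = buildMagic(); // a compiler would fold this to a constant
  for (unsigned I = 0; I != 8; ++I) {
    bool ViaLoad  = Table[I] == 7;      // original form: load + compare
    bool ViaMagic = (Magic >> I) & 1;   // rewritten form: shift + mask
    assert(ViaLoad == ViaMagic);
  }
  return 0;
}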
We // order the state machines in complexity of the generated code. Value *Idx = GEP->getOperand(2); // If the index is larger than the pointer size of the target, truncate the // index down like the GEP would do implicitly. We don't have to do this for // an inbounds GEP because the index can't be out of range. if (!GEP->isInBounds()) { Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); unsigned PtrSize = IntPtrTy->getIntegerBitWidth(); if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize) Idx = Builder->CreateTrunc(Idx, IntPtrTy); } // If the comparison is only true for one or two elements, emit direct // comparisons. if (SecondTrueElement != Overdefined) { // None true -> false. if (FirstTrueElement == Undefined) return replaceInstUsesWith(ICI, Builder->getFalse()); Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement); // True for one element -> 'i == 47'. if (SecondTrueElement == Undefined) return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx); // True for two elements -> 'i == 47 | i == 72'. Value *C1 = Builder->CreateICmpEQ(Idx, FirstTrueIdx); Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement); Value *C2 = Builder->CreateICmpEQ(Idx, SecondTrueIdx); return BinaryOperator::CreateOr(C1, C2); } // If the comparison is only false for one or two elements, emit direct // comparisons. if (SecondFalseElement != Overdefined) { // None false -> true. if (FirstFalseElement == Undefined) return replaceInstUsesWith(ICI, Builder->getTrue()); Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement); // False for one element -> 'i != 47'. if (SecondFalseElement == Undefined) return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx); // False for two elements -> 'i != 47 & i != 72'. Value *C1 = Builder->CreateICmpNE(Idx, FirstFalseIdx); Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement); Value *C2 = Builder->CreateICmpNE(Idx, SecondFalseIdx); return BinaryOperator::CreateAnd(C1, C2); } // If the comparison can be replaced with a range comparison for the elements // where it is true, emit the range check. if (TrueRangeEnd != Overdefined) { assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare"); // Generate (i-FirstTrue) getType(), -FirstTrueElement); Idx = Builder->CreateAdd(Idx, Offs); } Value *End = ConstantInt::get(Idx->getType(), TrueRangeEnd-FirstTrueElement+1); return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End); } // False range check. if (FalseRangeEnd != Overdefined) { assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare"); // Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse). 
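// The range-check emission here relies on a classic trick: with wraparound
// unsigned arithmetic, "lo <= i && i <= hi" collapses to the single compare
// "(i - lo) <u (hi - lo + 1)", and the complement gives the ">u" form used
// for the false range.  Standalone sketch over uint8_t, verified
// exhaustively for one range that does not cover the whole domain;
// illustration only, not the LLVM code.
#include <cassert>
#include <cstdint>

static bool inRangeTwoCompares(uint8_t I, uint8_t Lo, uint8_t Hi) {
  return Lo <= I && I <= Hi;
}

static bool inRangeOneCompare(uint8_t I, uint8_t Lo, uint8_t Hi) {
  // Subtraction wraps modulo 256; values below Lo wrap to large numbers and
  // fail the single unsigned comparison.
  return (uint8_t)(I - Lo) < (uint8_t)(Hi - Lo + 1);
}

int main() {
  const uint8_t Lo = 47, Hi = 72; // in the spirit of the "i == 47 | i == 72" examples
  for (unsigned I = 0; I <= 255; ++I)
    assert(inRangeTwoCompares((uint8_t)I, Lo, Hi) ==
           inRangeOneCompare((uint8_t)I, Lo, Hi));
  return 0;
}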
if (FirstFalseElement) { Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement); Idx = Builder->CreateAdd(Idx, Offs); } Value *End = ConstantInt::get(Idx->getType(), FalseRangeEnd-FirstFalseElement); return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End); } // If a magic bitvector captures the entire comparison state // of this load, replace it with computation that does: // ((magic_cst >> i) & 1) != 0 { Type *Ty = nullptr; // Look for an appropriate type: // - The type of Idx if the magic fits // - The smallest fitting legal type if we have a DataLayout // - Default to i32 if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth()) Ty = Idx->getType(); else Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount); if (Ty) { Value *V = Builder->CreateIntCast(Idx, Ty, false); V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V); V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V); return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0)); } } return nullptr; } /// Return a value that can be used to compare the *offset* implied by a GEP to /// zero. For example, if we have &A[i], we want to return 'i' for /// "icmp ne i, 0". Note that, in general, indices can be complex, and scales /// are involved. The above expression would also be legal to codegen as /// "icmp ne (i*4), 0" (assuming A is a pointer to i32). /// This latter form is less amenable to optimization though, and we are allowed /// to generate the first by knowing that pointer arithmetic doesn't overflow. /// /// If we can't emit an optimized form for this expression, this returns null. /// static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC, const DataLayout &DL) { gep_type_iterator GTI = gep_type_begin(GEP); // Check to see if this gep only has a single variable index. If so, and if // any constant indices are a multiple of its scale, then we can compute this // in terms of the scale of the variable index. For example, if the GEP // implies an offset of "12 + i*4", then we can codegen this as "3 + i", // because the expression will cross zero at the same point. unsigned i, e = GEP->getNumOperands(); int64_t Offset = 0; for (i = 1; i != e; ++i, ++GTI) { if (ConstantInt *CI = dyn_cast(GEP->getOperand(i))) { // Compute the aggregate offset of constant indices. if (CI->isZero()) continue; // Handle a struct index, which adds its field offset to the pointer. if (StructType *STy = dyn_cast(*GTI)) { Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue()); } else { uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); Offset += Size*CI->getSExtValue(); } } else { // Found our variable index. break; } } // If there are no variable indices, we must have a constant offset, just // evaluate it the general way. if (i == e) return nullptr; Value *VariableIdx = GEP->getOperand(i); // Determine the scale factor of the variable element. For example, this is // 4 if the variable index is into an array of i32. uint64_t VariableScale = DL.getTypeAllocSize(GTI.getIndexedType()); // Verify that there are no other variable indices. If so, emit the hard way. for (++i, ++GTI; i != e; ++i, ++GTI) { ConstantInt *CI = dyn_cast(GEP->getOperand(i)); if (!CI) return nullptr; // Compute the aggregate offset of constant indices. if (CI->isZero()) continue; // Handle a struct index, which adds its field offset to the pointer. 
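// EvaluateGEPOffsetExpression above exploits that a GEP offset of the form
// "C + i*S" crosses zero exactly where "C/S + i" does, provided the constant
// part C is a multiple of the single variable index's scale S.  Standalone
// sketch over int64_t; illustration only, not the LLVM code, and the helper
// name and constants are made up.
#include <cassert>
#include <cstdint>

// Returns true if "C + i*S == 0" may be rewritten as "(C/S) + i == 0".
static bool canFold(int64_t C, int64_t S) {
  return S != 0 && C % S == 0;
}

int main() {
  const int64_t C = 12, S = 4; // e.g. byte offset 12 plus i * sizeof(int32_t)
  assert(canFold(C, S));
  for (int64_t I = -8; I <= 8; ++I) {
    bool Full   = (C + I * S) == 0; // compare the raw byte offset against zero
    bool Scaled = (C / S + I) == 0; // compare the scaled form "3 + i" instead
    assert(Full == Scaled);
  }
  // The rewrite is not valid when the constant is not a multiple of the scale:
  // "10 + 3*i" is never zero for integer i, but "10/3 + i" would be at i == -3.
  assert(!canFold(10, 3));
  return 0;
}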
if (StructType *STy = dyn_cast(*GTI)) { Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue()); } else { uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); Offset += Size*CI->getSExtValue(); } } // Okay, we know we have a single variable index, which must be a // pointer/array/vector index. If there is no offset, life is simple, return // the index. Type *IntPtrTy = DL.getIntPtrType(GEP->getOperand(0)->getType()); unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth(); if (Offset == 0) { // Cast to intptrty in case a truncation occurs. If an extension is needed, // we don't need to bother extending: the extension won't affect where the // computation crosses zero. if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) { VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy); } return VariableIdx; } // Otherwise, there is an index. The computation we will do will be modulo // the pointer size, so get it. uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth); Offset &= PtrSizeMask; VariableScale &= PtrSizeMask; // To do this transformation, any constant index must be a multiple of the // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i", // but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a // multiple of the variable scale. int64_t NewOffs = Offset / (int64_t)VariableScale; if (Offset != NewOffs*(int64_t)VariableScale) return nullptr; // Okay, we can do this evaluation. Start by converting the index to intptr. if (VariableIdx->getType() != IntPtrTy) VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy, true /*Signed*/); Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs); return IC.Builder->CreateAdd(VariableIdx, OffsetVal, "offset"); } /// Returns true if we can rewrite Start as a GEP with pointer Base /// and some integer offset. The nodes that need to be re-written /// for this transformation will be added to Explored. static bool canRewriteGEPAsOffset(Value *Start, Value *Base, const DataLayout &DL, SetVector &Explored) { SmallVector WorkList(1, Start); Explored.insert(Base); // The following traversal gives us an order which can be used // when doing the final transformation. Since in the final // transformation we create the PHI replacement instructions first, // we don't have to get them in any particular order. // // However, for other instructions we will have to traverse the // operands of an instruction first, which means that we have to // do a post-order traversal. while (!WorkList.empty()) { SetVector PHIs; while (!WorkList.empty()) { if (Explored.size() >= 100) return false; Value *V = WorkList.back(); if (Explored.count(V) != 0) { WorkList.pop_back(); continue; } if (!isa(V) && !isa(V) && - !isa(V) && !isa(V)) + !isa(V) && !isa(V)) // We've found some value that we can't explore which is different from // the base. Therefore we can't do this transformation. return false; if (isa(V) || isa(V)) { auto *CI = dyn_cast(V); if (!CI->isNoopCast(DL)) return false; if (Explored.count(CI->getOperand(0)) == 0) WorkList.push_back(CI->getOperand(0)); } if (auto *GEP = dyn_cast(V)) { // We're limiting the GEP to having one index. This will preserve // the original pointer type. We could handle more cases in the // future. 
if (GEP->getNumIndices() != 1 || !GEP->isInBounds() || GEP->getType() != Start->getType()) return false; if (Explored.count(GEP->getOperand(0)) == 0) WorkList.push_back(GEP->getOperand(0)); } if (WorkList.back() == V) { WorkList.pop_back(); // We've finished visiting this node, mark it as such. Explored.insert(V); } if (auto *PN = dyn_cast(V)) { // We cannot transform PHIs on unsplittable basic blocks. if (isa(PN->getParent()->getTerminator())) return false; Explored.insert(PN); PHIs.insert(PN); } } // Explore the PHI nodes further. for (auto *PN : PHIs) for (Value *Op : PN->incoming_values()) if (Explored.count(Op) == 0) WorkList.push_back(Op); } // Make sure that we can do this. Since we can't insert GEPs in a basic // block before a PHI node, we can't easily do this transformation if // we have PHI node users of transformed instructions. for (Value *Val : Explored) { for (Value *Use : Val->uses()) { auto *PHI = dyn_cast(Use); auto *Inst = dyn_cast(Val); if (Inst == Base || Inst == PHI || !Inst || !PHI || Explored.count(PHI) == 0) continue; if (PHI->getParent() == Inst->getParent()) return false; } } return true; } // Sets the appropriate insert point on Builder where we can add // a replacement Instruction for V (if that is possible). static void setInsertionPoint(IRBuilder<> &Builder, Value *V, bool Before = true) { if (auto *PHI = dyn_cast(V)) { Builder.SetInsertPoint(&*PHI->getParent()->getFirstInsertionPt()); return; } if (auto *I = dyn_cast(V)) { if (!Before) I = &*std::next(I->getIterator()); Builder.SetInsertPoint(I); return; } if (auto *A = dyn_cast(V)) { // Set the insertion point in the entry block. BasicBlock &Entry = A->getParent()->getEntryBlock(); Builder.SetInsertPoint(&*Entry.getFirstInsertionPt()); return; } // Otherwise, this is a constant and we don't need to set a new // insertion point. assert(isa(V) && "Setting insertion point for unknown value!"); } /// Returns a re-written value of Start as an indexed GEP using Base as a /// pointer. static Value *rewriteGEPAsOffset(Value *Start, Value *Base, const DataLayout &DL, SetVector &Explored) { // Perform all the substitutions. This is a bit tricky because we can // have cycles in our use-def chains. // 1. Create the PHI nodes without any incoming values. // 2. Create all the other values. // 3. Add the edges for the PHI nodes. // 4. Emit GEPs to get the original pointers. // 5. Remove the original instructions. Type *IndexType = IntegerType::get( Base->getContext(), DL.getPointerTypeSizeInBits(Start->getType())); DenseMap NewInsts; NewInsts[Base] = ConstantInt::getNullValue(IndexType); // Create the new PHI nodes, without adding any incoming values. for (Value *Val : Explored) { if (Val == Base) continue; // Create empty phi nodes. This avoids cyclic dependencies when creating // the remaining instructions. if (auto *PHI = dyn_cast(Val)) NewInsts[PHI] = PHINode::Create(IndexType, PHI->getNumIncomingValues(), PHI->getName() + ".idx", PHI); } IRBuilder<> Builder(Base->getContext()); // Create all the other instructions. for (Value *Val : Explored) { if (NewInsts.find(Val) != NewInsts.end()) continue; if (auto *CI = dyn_cast(Val)) { NewInsts[CI] = NewInsts[CI->getOperand(0)]; continue; } if (auto *GEP = dyn_cast(Val)) { Value *Index = NewInsts[GEP->getOperand(1)] ? NewInsts[GEP->getOperand(1)] : GEP->getOperand(1); setInsertionPoint(Builder, GEP); // Indices might need to be sign extended. GEPs will magically do // this, but we need to do it ourselves here. 
if (Index->getType()->getScalarSizeInBits() != NewInsts[GEP->getOperand(0)]->getType()->getScalarSizeInBits()) { Index = Builder.CreateSExtOrTrunc( Index, NewInsts[GEP->getOperand(0)]->getType(), GEP->getOperand(0)->getName() + ".sext"); } auto *Op = NewInsts[GEP->getOperand(0)]; if (isa(Op) && dyn_cast(Op)->isZero()) NewInsts[GEP] = Index; else NewInsts[GEP] = Builder.CreateNSWAdd( Op, Index, GEP->getOperand(0)->getName() + ".add"); continue; } if (isa(Val)) continue; llvm_unreachable("Unexpected instruction type"); } // Add the incoming values to the PHI nodes. for (Value *Val : Explored) { if (Val == Base) continue; // All the instructions have been created, we can now add edges to the // phi nodes. if (auto *PHI = dyn_cast(Val)) { PHINode *NewPhi = static_cast(NewInsts[PHI]); for (unsigned I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) { Value *NewIncoming = PHI->getIncomingValue(I); if (NewInsts.find(NewIncoming) != NewInsts.end()) NewIncoming = NewInsts[NewIncoming]; NewPhi->addIncoming(NewIncoming, PHI->getIncomingBlock(I)); } } } for (Value *Val : Explored) { if (Val == Base) continue; // Depending on the type, for external users we have to emit // a GEP or a GEP + ptrtoint. setInsertionPoint(Builder, Val, false); // If required, create an inttoptr instruction for Base. Value *NewBase = Base; if (!Base->getType()->isPointerTy()) NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(), Start->getName() + "to.ptr"); Value *GEP = Builder.CreateInBoundsGEP( Start->getType()->getPointerElementType(), NewBase, makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr"); if (!Val->getType()->isPointerTy()) { Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(), Val->getName() + ".conv"); GEP = Cast; } Val->replaceAllUsesWith(GEP); } return NewInsts[Start]; } /// Looks through GEPs, IntToPtrInsts and PtrToIntInsts in order to express /// the input Value as a constant indexed GEP. Returns a pair containing /// the GEPs Pointer and Index. static std::pair getAsConstantIndexedAddress(Value *V, const DataLayout &DL) { Type *IndexType = IntegerType::get(V->getContext(), DL.getPointerTypeSizeInBits(V->getType())); Constant *Index = ConstantInt::getNullValue(IndexType); while (true) { if (GEPOperator *GEP = dyn_cast(V)) { // We accept only inbouds GEPs here to exclude the possibility of // overflow. if (!GEP->isInBounds()) break; if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 && GEP->getType() == V->getType()) { V = GEP->getOperand(0); Constant *GEPIndex = static_cast(GEP->getOperand(1)); Index = ConstantExpr::getAdd( Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType)); continue; } break; } if (auto *CI = dyn_cast(V)) { if (!CI->isNoopCast(DL)) break; V = CI->getOperand(0); continue; } if (auto *CI = dyn_cast(V)) { if (!CI->isNoopCast(DL)) break; V = CI->getOperand(0); continue; } break; } return {V, Index}; } /// Converts (CMP GEPLHS, RHS) if this change would make RHS a constant. /// We can look through PHIs, GEPs and casts in order to determine a common base /// between GEPLHS and RHS. static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, const DataLayout &DL) { if (!GEPLHS->hasAllConstantIndices()) return nullptr; Value *PtrBase, *Index; std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL); // The set of nodes that will take part in this transformation. 
SetVector Nodes; if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes)) return nullptr; // We know we can re-write this as // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) // Since we've only looked through inbouds GEPs we know that we // can't have overflow on either side. We can therefore re-write // this as: // OFFSET1 cmp OFFSET2 Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes); // RewriteGEPAsOffset has replaced RHS and all of its uses with a re-written // GEP having PtrBase as the pointer base, and has returned in NewRHS the // offset. Since Index is the offset of LHS to the base pointer, we will now // compare the offsets instead of comparing the pointers. return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Index, NewRHS); } /// Fold comparisons between a GEP instruction and something else. At this point /// we know that the GEP is on the LHS of the comparison. Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, ICmpInst::Predicate Cond, Instruction &I) { // Don't transform signed compares of GEPs into index compares. Even if the // GEP is inbounds, the final add of the base pointer can have signed overflow // and would change the result of the icmp. // e.g. "&foo[0] (RHS)) RHS = RHS->stripPointerCasts(); Value *PtrBase = GEPLHS->getOperand(0); if (PtrBase == RHS && GEPLHS->isInBounds()) { // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0). // This transformation (ignoring the base and scales) is valid because we // know pointers can't overflow since the gep is inbounds. See if we can // output an optimized form. Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this, DL); // If not, synthesize the offset the hard way. if (!Offset) Offset = EmitGEPOffset(GEPLHS); return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset, Constant::getNullValue(Offset->getType())); } else if (GEPOperator *GEPRHS = dyn_cast(RHS)) { // If the base pointers are different, but the indices are the same, just // compare the base pointer. if (PtrBase != GEPRHS->getOperand(0)) { bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands(); IndicesTheSame &= GEPLHS->getOperand(0)->getType() == GEPRHS->getOperand(0)->getType(); if (IndicesTheSame) for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i) if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) { IndicesTheSame = false; break; } // If all indices are the same, just compare the base pointers. if (IndicesTheSame) return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0)); // If we're comparing GEPs with two base pointers that only differ in type // and both GEPs have only constant indices or just one use, then fold // the compare with the adjusted indices. if (GEPLHS->isInBounds() && GEPRHS->isInBounds() && (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) && (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) && PtrBase->stripPointerCasts() == GEPRHS->getOperand(0)->stripPointerCasts()) { Value *LOffset = EmitGEPOffset(GEPLHS); Value *ROffset = EmitGEPOffset(GEPRHS); // If we looked through an addrspacecast between different sized address // spaces, the LHS and RHS pointers are different sized // integers. Truncate to the smaller one. 
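// FoldGEPICmp and transformToIndexedCompare above replace a comparison of two
// addresses derived from the same base (through inbounds GEPs, no-op casts
// and PHIs) with a comparison of the integer offsets, which is sound because
// inbounds address arithmetic cannot wrap.  Minimal C++ analogue: for
// pointers into one array, comparing the pointers and comparing the indices
// agree.  Illustration only, not the LLVM code.
#include <cassert>
#include <cstddef>

int main() {
  int A[16] = {};
  for (size_t I = 0; I != 16; ++I)
    for (size_t J = 0; J != 16; ++J) {
      bool PtrCmp = &A[I] < &A[J]; // well-defined: both point into the same array
      bool IdxCmp = I < J;         // the rewritten, pointer-free form
      assert(PtrCmp == IdxCmp);
    }
  return 0;
}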
Type *LHSIndexTy = LOffset->getType(); Type *RHSIndexTy = ROffset->getType(); if (LHSIndexTy != RHSIndexTy) { if (LHSIndexTy->getPrimitiveSizeInBits() < RHSIndexTy->getPrimitiveSizeInBits()) { ROffset = Builder->CreateTrunc(ROffset, LHSIndexTy); } else LOffset = Builder->CreateTrunc(LOffset, RHSIndexTy); } Value *Cmp = Builder->CreateICmp(ICmpInst::getSignedPredicate(Cond), LOffset, ROffset); return replaceInstUsesWith(I, Cmp); } // Otherwise, the base pointers are different and the indices are // different. Try convert this to an indexed compare by looking through // PHIs/casts. return transformToIndexedCompare(GEPLHS, RHS, Cond, DL); } // If one of the GEPs has all zero indices, recurse. if (GEPLHS->hasAllZeroIndices()) return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0), ICmpInst::getSwappedPredicate(Cond), I); // If the other GEP has all zero indices, recurse. if (GEPRHS->hasAllZeroIndices()) return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I); bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds(); if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) { // If the GEPs only differ by one index, compare it. unsigned NumDifferences = 0; // Keep track of # differences. unsigned DiffOperand = 0; // The operand that differs. for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i) if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) { if (GEPLHS->getOperand(i)->getType()->getPrimitiveSizeInBits() != GEPRHS->getOperand(i)->getType()->getPrimitiveSizeInBits()) { // Irreconcilable differences. NumDifferences = 2; break; } else { if (NumDifferences++) break; DiffOperand = i; } } if (NumDifferences == 0) // SAME GEP? return replaceInstUsesWith(I, // No comparison is needed here. Builder->getInt1(ICmpInst::isTrueWhenEqual(Cond))); else if (NumDifferences == 1 && GEPsInBounds) { Value *LHSV = GEPLHS->getOperand(DiffOperand); Value *RHSV = GEPRHS->getOperand(DiffOperand); // Make sure we do a signed comparison here. return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV); } } // Only lower this if the icmp is the only user of the GEP or if we expect // the result to fold to a constant! if (GEPsInBounds && (isa(GEPLHS) || GEPLHS->hasOneUse()) && (isa(GEPRHS) || GEPRHS->hasOneUse())) { // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2) Value *L = EmitGEPOffset(GEPLHS); Value *R = EmitGEPOffset(GEPRHS); return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R); } } // Try convert this to an indexed compare by looking through PHIs/casts as a // last resort. return transformToIndexedCompare(GEPLHS, RHS, Cond, DL); } Instruction *InstCombiner::FoldAllocaCmp(ICmpInst &ICI, AllocaInst *Alloca, Value *Other) { assert(ICI.isEquality() && "Cannot fold non-equality comparison."); // It would be tempting to fold away comparisons between allocas and any // pointer not based on that alloca (e.g. an argument). However, even // though such pointers cannot alias, they can still compare equal. // // But LLVM doesn't specify where allocas get their memory, so if the alloca // doesn't escape we can argue that it's impossible to guess its value, and we // can therefore act as if any such guesses are wrong. // // The code below checks that the alloca doesn't escape, and that it's only // used in a comparison once (the current instruction). The // single-comparison-use condition ensures that we're trivially folding all // comparisons against the alloca consistently, and avoids the risk of // erroneously folding a comparison of the pointer with itself. 
unsigned MaxIter = 32; // Break cycles and bound to constant-time. SmallVector Worklist; for (Use &U : Alloca->uses()) { if (Worklist.size() >= MaxIter) return nullptr; Worklist.push_back(&U); } unsigned NumCmps = 0; while (!Worklist.empty()) { assert(Worklist.size() <= MaxIter); Use *U = Worklist.pop_back_val(); Value *V = U->getUser(); --MaxIter; if (isa(V) || isa(V) || isa(V) || isa(V)) { // Track the uses. } else if (isa(V)) { // Loading from the pointer doesn't escape it. continue; } else if (auto *SI = dyn_cast(V)) { // Storing *to* the pointer is fine, but storing the pointer escapes it. if (SI->getValueOperand() == U->get()) return nullptr; continue; } else if (isa(V)) { if (NumCmps++) return nullptr; // Found more than one cmp. continue; } else if (auto *Intrin = dyn_cast(V)) { switch (Intrin->getIntrinsicID()) { // These intrinsics don't escape or compare the pointer. Memset is safe // because we don't allow ptrtoint. Memcpy and memmove are safe because // we don't allow stores, so src cannot point to V. case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: case Intrinsic::dbg_declare: case Intrinsic::dbg_value: case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: continue; default: return nullptr; } } else { return nullptr; } for (Use &U : V->uses()) { if (Worklist.size() >= MaxIter) return nullptr; Worklist.push_back(&U); } } Type *CmpTy = CmpInst::makeCmpResultType(Other->getType()); return replaceInstUsesWith( ICI, ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate()))); } /// Fold "icmp pred (X+CI), X". Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI, ICmpInst::Predicate Pred) { // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0, // so the values can never be equal. Similarly for all other "or equals" // operators. // (X+1) X >u (MAXUINT-1) --> X == 255 // (X+2) X >u (MAXUINT-2) --> X > 253 // (X+MAXUINT) X >u (MAXUINT-MAXUINT) --> X != 0 if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) { Value *R = ConstantExpr::getSub(ConstantInt::getAllOnesValue(CI->getType()), CI); return new ICmpInst(ICmpInst::ICMP_UGT, X, R); } // (X+1) >u X --> X X != 255 // (X+2) >u X --> X X u X --> X X X == 0 if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantExpr::getNeg(CI)); unsigned BitWidth = CI->getType()->getPrimitiveSizeInBits(); ConstantInt *SMax = ConstantInt::get(X->getContext(), APInt::getSignedMaxValue(BitWidth)); // (X+ 1) X >s (MAXSINT-1) --> X == 127 // (X+ 2) X >s (MAXSINT-2) --> X >s 125 // (X+MAXSINT) X >s (MAXSINT-MAXSINT) --> X >s 0 // (X+MINSINT) X >s (MAXSINT-MINSINT) --> X >s -1 // (X+ -2) X >s (MAXSINT- -2) --> X >s 126 // (X+ -1) X >s (MAXSINT- -1) --> X != 127 if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) return new ICmpInst(ICmpInst::ICMP_SGT, X, ConstantExpr::getSub(SMax, CI)); // (X+ 1) >s X --> X X != 127 // (X+ 2) >s X --> X X s X --> X X s X --> X X s X --> X X s X --> X X == -128 assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE); Constant *C = Builder->getInt(CI->getValue()-1); return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C)); } /// Fold "icmp pred, ([su]div X, DivRHS), CmpRHS" where DivRHS and CmpRHS are /// both known to be integer constants. 
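// FoldICmpAddOpCst above rests on wraparound identities such as
// "(X+C) <u X  <=>  X >u (UMAX - C)" and "(X+C) >u X  <=>  X <u -C" for any
// nonzero constant C (the signed cases use SMAX analogously).  Exhaustive
// check of the two unsigned identities over uint8_t; illustration only, not
// the LLVM code.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned CU = 1; CU <= 255; ++CU) {   // C != 0
    const uint8_t C = (uint8_t)CU;
    const uint8_t NegC = (uint8_t)-C;            // -C modulo 256
    const uint8_t UMaxMinusC = (uint8_t)(0xFF - C);
    for (unsigned XU = 0; XU <= 255; ++XU) {
      const uint8_t X = (uint8_t)XU;
      const uint8_t Sum = (uint8_t)(X + C);      // wraps modulo 256
      assert((Sum < X) == (X > UMaxMinusC));     // (X+C) <u X  <=>  X >u UMAX-C
      assert((Sum > X) == (X < NegC));           // (X+C) >u X  <=>  X <u -C
    }
  }
  return 0;
}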
Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, ConstantInt *DivRHS) { ConstantInt *CmpRHS = cast(ICI.getOperand(1)); const APInt &CmpRHSV = CmpRHS->getValue(); // FIXME: If the operand types don't match the type of the divide // then don't attempt this transform. The code below doesn't have the // logic to deal with a signed divide and an unsigned compare (and // vice versa). This is because (x /s C1) getOpcode() == Instruction::SDiv; if (!ICI.isEquality() && DivIsSigned != ICI.isSigned()) return nullptr; if (DivRHS->isZero()) return nullptr; // The ProdOV computation fails on divide by zero. if (DivIsSigned && DivRHS->isAllOnesValue()) return nullptr; // The overflow computation also screws up here if (DivRHS->isOne()) { // This eliminates some funny cases with INT_MIN. ICI.setOperand(0, DivI->getOperand(0)); // X/1 == X. return &ICI; } // Compute Prod = CI * DivRHS. We are essentially solving an equation // of form X/C1=C2. We solve for X by multiplying C1 (DivRHS) and // C2 (CI). By solving for X we can turn this into a range check // instead of computing a divide. Constant *Prod = ConstantExpr::getMul(CmpRHS, DivRHS); // Determine if the product overflows by seeing if the product is // not equal to the divide. Make sure we do the same kind of divide // as in the LHS instruction that we're folding. bool ProdOV = (DivIsSigned ? ConstantExpr::getSDiv(Prod, DivRHS) : ConstantExpr::getUDiv(Prod, DivRHS)) != CmpRHS; // Get the ICmp opcode ICmpInst::Predicate Pred = ICI.getPredicate(); // If the division is known to be exact, then there is no remainder from the // divide, so the covered range size is unit, otherwise it is the divisor. ConstantInt *RangeSize = DivI->isExact() ? getOne(Prod) : DivRHS; // Figure out the interval that is being checked. For example, a comparison // like "X /u 5 == 0" is really checking that X is in the interval [0, 5). // Compute this interval based on the constants involved and the signedness of // the compare/divide. This computes a half-open interval, keeping track of // whether either value in the interval overflows. After analysis each // overflow variable is set to 0 if it's corresponding bound variable is valid // -1 if overflowed off the bottom end, or +1 if overflowed off the top end. int LoOverflow = 0, HiOverflow = 0; Constant *LoBound = nullptr, *HiBound = nullptr; if (!DivIsSigned) { // udiv // e.g. X/5 op 3 --> [15, 20) LoBound = Prod; HiOverflow = LoOverflow = ProdOV; if (!HiOverflow) { // If this is not an exact divide, then many values in the range collapse // to the same result value. HiOverflow = AddWithOverflow(HiBound, LoBound, RangeSize, false); } } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0. if (CmpRHSV == 0) { // (X / pos) op 0 // Can't overflow. e.g. X/2 op 0 --> [-1, 2) LoBound = ConstantExpr::getNeg(SubOne(RangeSize)); HiBound = RangeSize; } else if (CmpRHSV.isStrictlyPositive()) { // (X / pos) op pos LoBound = Prod; // e.g. X/5 op 3 --> [15, 20) HiOverflow = LoOverflow = ProdOV; if (!HiOverflow) HiOverflow = AddWithOverflow(HiBound, Prod, RangeSize, true); } else { // (X / pos) op neg // e.g. X/5 op -3 --> [-15-4, -15+1) --> [-19, -14) HiBound = AddOne(Prod); LoOverflow = HiOverflow = ProdOV ? -1 : 0; if (!LoOverflow) { ConstantInt *DivNeg =cast(ConstantExpr::getNeg(RangeSize)); LoOverflow = AddWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0; } } } else if (DivRHS->isNegative()) { // Divisor is < 0. 
if (DivI->isExact()) RangeSize = cast(ConstantExpr::getNeg(RangeSize)); if (CmpRHSV == 0) { // (X / neg) op 0 // e.g. X/-5 op 0 --> [-4, 5) LoBound = AddOne(RangeSize); HiBound = cast(ConstantExpr::getNeg(RangeSize)); if (HiBound == DivRHS) { // -INTMIN = INTMIN HiOverflow = 1; // [INTMIN+1, overflow) HiBound = nullptr; // e.g. X/INTMIN = 0 --> X > INTMIN } } else if (CmpRHSV.isStrictlyPositive()) { // (X / neg) op pos // e.g. X/-5 op 3 --> [-19, -14) HiBound = AddOne(Prod); HiOverflow = LoOverflow = ProdOV ? -1 : 0; if (!LoOverflow) LoOverflow = AddWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0; } else { // (X / neg) op neg LoBound = Prod; // e.g. X/-5 op -3 --> [15, 20) LoOverflow = HiOverflow = ProdOV; if (!HiOverflow) HiOverflow = SubWithOverflow(HiBound, Prod, RangeSize, true); } // Dividing by a negative swaps the condition. LT <-> GT Pred = ICmpInst::getSwappedPredicate(Pred); } Value *X = DivI->getOperand(0); switch (Pred) { default: llvm_unreachable("Unhandled icmp opcode!"); case ICmpInst::ICMP_EQ: if (LoOverflow && HiOverflow) return replaceInstUsesWith(ICI, Builder->getFalse()); if (HiOverflow) return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE, X, LoBound); if (LoOverflow) return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, X, HiBound); return replaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound, DivIsSigned, true)); case ICmpInst::ICMP_NE: if (LoOverflow && HiOverflow) return replaceInstUsesWith(ICI, Builder->getTrue()); if (HiOverflow) return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, X, LoBound); if (LoOverflow) return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE, X, HiBound); return replaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound, DivIsSigned, false)); case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_SLT: if (LoOverflow == +1) // Low bound is greater than input range. return replaceInstUsesWith(ICI, Builder->getTrue()); if (LoOverflow == -1) // Low bound is less than input range. return replaceInstUsesWith(ICI, Builder->getFalse()); return new ICmpInst(Pred, X, LoBound); case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_SGT: if (HiOverflow == +1) // High bound greater than input range. return replaceInstUsesWith(ICI, Builder->getFalse()); if (HiOverflow == -1) // High bound less than input range. return replaceInstUsesWith(ICI, Builder->getTrue()); if (Pred == ICmpInst::ICMP_UGT) return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound); return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound); } } /// Handle "icmp(([al]shr X, cst1), cst2)". Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, ConstantInt *ShAmt) { const APInt &CmpRHSV = cast(ICI.getOperand(1))->getValue(); // Check that the shift amount is in range. If not, don't perform // undefined shifts. When the shift is visited it will be // simplified. uint32_t TypeBits = CmpRHSV.getBitWidth(); uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits); if (ShAmtVal >= TypeBits || ShAmtVal == 0) return nullptr; if (!ICI.isEquality()) { // If we have an unsigned comparison and an ashr, we can't simplify this. // Similarly for signed comparisons with lshr. if (ICI.isSigned() != (Shr->getOpcode() == Instruction::AShr)) return nullptr; // Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv // by a power of 2. Since we already have logic to simplify these, // transform to div and then simplify the resultant comparison. 
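// The path here lowers "icmp (lshr X, k), C" to a udiv compare, since a
// logical right shift by k is exactly an unsigned divide by 2^k, and
// FoldICmpDivCst then turns "X /u D == C" into a half-open range test on
// [C*D, C*D + D).  Exhaustive check of one instance over uint8_t;
// illustration only, not the LLVM code.
#include <cassert>
#include <cstdint>

int main() {
  const unsigned K = 2, D = 1u << K, C = 3;
  for (unsigned XU = 0; XU <= 255; ++XU) {
    const uint8_t X = (uint8_t)XU;
    bool ViaShift = (uint8_t)(X >> K) == C;    // original form: shift, then compare
    bool ViaDiv   = (uint8_t)(X / D) == C;     // lshr rewritten as udiv
    bool ViaRange = (uint8_t)(X - C * D) < D;  // range test for [12, 16)
    assert(ViaShift == ViaDiv && ViaDiv == ViaRange);
  }
  return 0;
}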
if (Shr->getOpcode() == Instruction::AShr && (!Shr->isExact() || ShAmtVal == TypeBits - 1)) return nullptr; // Revisit the shift (to delete it). Worklist.Add(Shr); Constant *DivCst = ConstantInt::get(Shr->getType(), APInt::getOneBitSet(TypeBits, ShAmtVal)); Value *Tmp = Shr->getOpcode() == Instruction::AShr ? Builder->CreateSDiv(Shr->getOperand(0), DivCst, "", Shr->isExact()) : Builder->CreateUDiv(Shr->getOperand(0), DivCst, "", Shr->isExact()); ICI.setOperand(0, Tmp); // If the builder folded the binop, just return it. BinaryOperator *TheDiv = dyn_cast(Tmp); if (!TheDiv) return &ICI; // Otherwise, fold this div/compare. assert(TheDiv->getOpcode() == Instruction::SDiv || TheDiv->getOpcode() == Instruction::UDiv); Instruction *Res = FoldICmpDivCst(ICI, TheDiv, cast(DivCst)); assert(Res && "This div/cst should have folded!"); return Res; } // If we are comparing against bits always shifted out, the // comparison cannot succeed. APInt Comp = CmpRHSV << ShAmtVal; ConstantInt *ShiftedCmpRHS = Builder->getInt(Comp); if (Shr->getOpcode() == Instruction::LShr) Comp = Comp.lshr(ShAmtVal); else Comp = Comp.ashr(ShAmtVal); if (Comp != CmpRHSV) { // Comparing against a bit that we know is zero. bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; Constant *Cst = Builder->getInt1(IsICMP_NE); return replaceInstUsesWith(ICI, Cst); } // Otherwise, check to see if the bits shifted out are known to be zero. // If so, we can compare against the unshifted value: // (X & 4) >> 1 == 2 --> (X & 4) == 4. if (Shr->hasOneUse() && Shr->isExact()) return new ICmpInst(ICI.getPredicate(), Shr->getOperand(0), ShiftedCmpRHS); if (Shr->hasOneUse()) { // Otherwise strength reduce the shift into an and. APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal)); Constant *Mask = Builder->getInt(Val); Value *And = Builder->CreateAnd(Shr->getOperand(0), Mask, Shr->getName()+".mask"); return new ICmpInst(ICI.getPredicate(), And, ShiftedCmpRHS); } return nullptr; } /// Handle "(icmp eq/ne (ashr/lshr const2, A), const1)" -> /// (icmp eq/ne A, Log2(const2/const1)) -> /// (icmp eq/ne A, Log2(const2) - Log2(const1)). Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A, ConstantInt *CI1, ConstantInt *CI2) { assert(I.isEquality() && "Cannot fold icmp gt/lt"); auto getConstant = [&I, this](bool IsTrue) { if (I.getPredicate() == I.ICMP_NE) IsTrue = !IsTrue; return replaceInstUsesWith(I, ConstantInt::get(I.getType(), IsTrue)); }; auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) { if (I.getPredicate() == I.ICMP_NE) Pred = CmpInst::getInversePredicate(Pred); return new ICmpInst(Pred, LHS, RHS); }; const APInt &AP1 = CI1->getValue(); const APInt &AP2 = CI2->getValue(); // Don't bother doing any work for cases which InstSimplify handles. if (AP2 == 0) return nullptr; bool IsAShr = isa(Op); if (IsAShr) { if (AP2.isAllOnesValue()) return nullptr; if (AP2.isNegative() != AP1.isNegative()) return nullptr; if (AP2.sgt(AP1)) return nullptr; } if (!AP1) // 'A' must be large enough to shift out the highest set bit. 
return getICmp(I.ICMP_UGT, A, ConstantInt::get(A->getType(), AP2.logBase2())); if (AP1 == AP2) return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType())); int Shift; if (IsAShr && AP1.isNegative()) Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes(); else Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros(); if (Shift > 0) { if (IsAShr && AP1 == AP2.ashr(Shift)) { // There are multiple solutions if we are comparing against -1 and the LHS // of the ashr is not a power of two. if (AP1.isAllOnesValue() && !AP2.isPowerOf2()) return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift)); return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); } else if (AP1 == AP2.lshr(Shift)) { return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); } } // Shifting const2 will never be equal to const1. return getConstant(false); } /// Handle "(icmp eq/ne (shl const2, A), const1)" -> /// (icmp eq/ne A, TrailingZeros(const1) - TrailingZeros(const2)). Instruction *InstCombiner::FoldICmpCstShlCst(ICmpInst &I, Value *Op, Value *A, ConstantInt *CI1, ConstantInt *CI2) { assert(I.isEquality() && "Cannot fold icmp gt/lt"); auto getConstant = [&I, this](bool IsTrue) { if (I.getPredicate() == I.ICMP_NE) IsTrue = !IsTrue; return replaceInstUsesWith(I, ConstantInt::get(I.getType(), IsTrue)); }; auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) { if (I.getPredicate() == I.ICMP_NE) Pred = CmpInst::getInversePredicate(Pred); return new ICmpInst(Pred, LHS, RHS); }; const APInt &AP1 = CI1->getValue(); const APInt &AP2 = CI2->getValue(); // Don't bother doing any work for cases which InstSimplify handles. if (AP2 == 0) return nullptr; unsigned AP2TrailingZeros = AP2.countTrailingZeros(); if (!AP1 && AP2TrailingZeros != 0) return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), AP2.getBitWidth() - AP2TrailingZeros)); if (AP1 == AP2) return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType())); // Get the distance between the lowest bits that are set. int Shift = AP1.countTrailingZeros() - AP2TrailingZeros; if (Shift > 0 && AP2.shl(Shift) == AP1) return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift)); // Shifting const2 will never be equal to const1. return getConstant(false); } /// Handle "icmp (instr, intcst)". Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, Instruction *LHSI, ConstantInt *RHS) { const APInt &RHSV = RHS->getValue(); switch (LHSI->getOpcode()) { case Instruction::Trunc: if (RHS->isOne() && RHSV.getBitWidth() > 1) { // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1 Value *V = nullptr; if (ICI.getPredicate() == ICmpInst::ICMP_SLT && match(LHSI->getOperand(0), m_Signum(m_Value(V)))) return new ICmpInst(ICmpInst::ICMP_SLT, V, ConstantInt::get(V->getType(), 1)); } if (ICI.isEquality() && LHSI->hasOneUse()) { // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all // of the high bits truncated out of x are known. unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(), SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits(); APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0); computeKnownBits(LHSI->getOperand(0), KnownZero, KnownOne, 0, &ICI); // If all the high bits are known, we can do this xform. if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) { // Pull in the high bits from known-ones set. 
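// The Trunc case above widens "icmp eq (trunc X to i8), 42" to a full-width
// compare whenever every truncated-out bit of X is known: the known high bits
// are simply OR'ed into the constant.  Standalone sketch where the high 24
// bits of a 32-bit value are fixed to a known pattern; illustration only, not
// the LLVM code, and the bit pattern is a made-up example.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t KnownHigh = 0xDEADBE00u;      // high 24 bits known, low 8 unknown
  const uint8_t  CmpLow = 42;
  const uint32_t CmpWide = KnownHigh | CmpLow; // "42 | highbits"
  for (uint32_t Low = 0; Low <= 0xFF; ++Low) {
    uint32_t X = KnownHigh | Low;              // any value consistent with the known bits
    bool Narrow = (uint8_t)X == CmpLow;        // original: compare the truncation
    bool Wide   = X == CmpWide;                // rewritten: full-width compare
    assert(Narrow == Wide);
  }
  return 0;
}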
APInt NewRHS = RHS->getValue().zext(SrcBits); NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits-DstBits); return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), Builder->getInt(NewRHS)); } } break; case Instruction::Xor: // (icmp pred (xor X, XorCst), CI) if (ConstantInt *XorCst = dyn_cast(LHSI->getOperand(1))) { // If this is a comparison that tests the signbit (X < 0) or (x > -1), // fold the xor. if ((ICI.getPredicate() == ICmpInst::ICMP_SLT && RHSV == 0) || (ICI.getPredicate() == ICmpInst::ICMP_SGT && RHSV.isAllOnesValue())) { Value *CompareVal = LHSI->getOperand(0); // If the sign bit of the XorCst is not set, there is no change to // the operation, just stop using the Xor. if (!XorCst->isNegative()) { ICI.setOperand(0, CompareVal); Worklist.Add(LHSI); return &ICI; } // Was the old condition true if the operand is positive? bool isTrueIfPositive = ICI.getPredicate() == ICmpInst::ICMP_SGT; // If so, the new one isn't. isTrueIfPositive ^= true; if (isTrueIfPositive) return new ICmpInst(ICmpInst::ICMP_SGT, CompareVal, SubOne(RHS)); else return new ICmpInst(ICmpInst::ICMP_SLT, CompareVal, AddOne(RHS)); } if (LHSI->hasOneUse()) { // (icmp u/s (xor A SignBit), C) -> (icmp s/u A, (xor C SignBit)) if (!ICI.isEquality() && XorCst->getValue().isSignBit()) { const APInt &SignBit = XorCst->getValue(); ICmpInst::Predicate Pred = ICI.isSigned() ? ICI.getUnsignedPredicate() : ICI.getSignedPredicate(); return new ICmpInst(Pred, LHSI->getOperand(0), Builder->getInt(RHSV ^ SignBit)); } // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A) if (!ICI.isEquality() && XorCst->isMaxValue(true)) { const APInt &NotSignBit = XorCst->getValue(); ICmpInst::Predicate Pred = ICI.isSigned() ? ICI.getUnsignedPredicate() : ICI.getSignedPredicate(); Pred = ICI.getSwappedPredicate(Pred); return new ICmpInst(Pred, LHSI->getOperand(0), Builder->getInt(RHSV ^ NotSignBit)); } } // (icmp ugt (xor X, C), ~C) -> (icmp ult X, C) // iff -C is a power of 2 if (ICI.getPredicate() == ICmpInst::ICMP_UGT && XorCst->getValue() == ~RHSV && (RHSV + 1).isPowerOf2()) return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0), XorCst); // (icmp ult (xor X, C), -C) -> (icmp uge X, C) // iff -C is a power of 2 if (ICI.getPredicate() == ICmpInst::ICMP_ULT && XorCst->getValue() == -RHSV && RHSV.isPowerOf2()) return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0), XorCst); } break; case Instruction::And: // (icmp pred (and X, AndCst), RHS) if (LHSI->hasOneUse() && isa(LHSI->getOperand(1)) && LHSI->getOperand(0)->hasOneUse()) { ConstantInt *AndCst = cast(LHSI->getOperand(1)); // If the LHS is an AND of a truncating cast, we can widen the // and/compare to be the input width without changing the value // produced, eliminating a cast. if (TruncInst *Cast = dyn_cast(LHSI->getOperand(0))) { // We can do this transformation if either the AND constant does not // have its sign bit set or if it is an equality comparison. // Extending a relational comparison when we're checking the sign // bit would not work. if (ICI.isEquality() || (!AndCst->isNegative() && RHSV.isNonNegative())) { Value *NewAnd = Builder->CreateAnd(Cast->getOperand(0), ConstantExpr::getZExt(AndCst, Cast->getSrcTy())); NewAnd->takeName(LHSI); return new ICmpInst(ICI.getPredicate(), NewAnd, ConstantExpr::getZExt(RHS, Cast->getSrcTy())); } } // If the LHS is an AND of a zext, and we have an equality compare, we can // shrink the and/compare to the smaller type, eliminating the cast. 
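// The Xor case above uses the fact that flipping the sign bit translates
// between unsigned and signed order: for 8-bit values, "(A ^ 0x80) <u (B ^ 0x80)"
// holds exactly when "A <s B" does, so an xor by the sign bit on the LHS can
// be folded into the constant by swapping the signedness of the predicate.
// Exhaustive check over 8-bit values (assuming two's-complement narrowing);
// illustration only, not the LLVM code.
#include <assert.h>
#include <cstdint>

int main() {
  for (unsigned A = 0; A <= 255; ++A)
    for (unsigned B = 0; B <= 255; ++B) {
      bool UnsignedFlipped = (uint8_t)(A ^ 0x80) < (uint8_t)(B ^ 0x80);
      bool SignedPlain     = (int8_t)(uint8_t)A < (int8_t)(uint8_t)B;
      assert(UnsignedFlipped == SignedPlain);
    }
  return 0;
}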
if (ZExtInst *Cast = dyn_cast(LHSI->getOperand(0))) { IntegerType *Ty = cast(Cast->getSrcTy()); // Make sure we don't compare the upper bits, SimplifyDemandedBits // should fold the icmp to true/false in that case. if (ICI.isEquality() && RHSV.getActiveBits() <= Ty->getBitWidth()) { Value *NewAnd = Builder->CreateAnd(Cast->getOperand(0), ConstantExpr::getTrunc(AndCst, Ty)); NewAnd->takeName(LHSI); return new ICmpInst(ICI.getPredicate(), NewAnd, ConstantExpr::getTrunc(RHS, Ty)); } } // If this is: (X >> C1) & C2 != C3 (where any shift and any compare // could exist), turn it into (X & (C2 << C1)) != (C3 << C1). This // happens a LOT in code produced by the C front-end, for bitfield // access. BinaryOperator *Shift = dyn_cast(LHSI->getOperand(0)); if (Shift && !Shift->isShift()) Shift = nullptr; ConstantInt *ShAmt; ShAmt = Shift ? dyn_cast(Shift->getOperand(1)) : nullptr; // This seemingly simple opportunity to fold away a shift turns out to // be rather complicated. See PR17827 // ( http://llvm.org/bugs/show_bug.cgi?id=17827 ) for details. if (ShAmt) { bool CanFold = false; unsigned ShiftOpcode = Shift->getOpcode(); if (ShiftOpcode == Instruction::AShr) { // There may be some constraints that make this possible, // but nothing simple has been discovered yet. CanFold = false; } else if (ShiftOpcode == Instruction::Shl) { // For a left shift, we can fold if the comparison is not signed. // We can also fold a signed comparison if the mask value and // comparison value are not negative. These constraints may not be // obvious, but we can prove that they are correct using an SMT // solver. if (!ICI.isSigned() || (!AndCst->isNegative() && !RHS->isNegative())) CanFold = true; } else if (ShiftOpcode == Instruction::LShr) { // For a logical right shift, we can fold if the comparison is not // signed. We can also fold a signed comparison if the shifted mask // value and the shifted comparison value are not negative. // These constraints may not be obvious, but we can prove that they // are correct using an SMT solver. if (!ICI.isSigned()) CanFold = true; else { ConstantInt *ShiftedAndCst = cast(ConstantExpr::getShl(AndCst, ShAmt)); ConstantInt *ShiftedRHSCst = cast(ConstantExpr::getShl(RHS, ShAmt)); if (!ShiftedAndCst->isNegative() && !ShiftedRHSCst->isNegative()) CanFold = true; } } if (CanFold) { Constant *NewCst; if (ShiftOpcode == Instruction::Shl) NewCst = ConstantExpr::getLShr(RHS, ShAmt); else NewCst = ConstantExpr::getShl(RHS, ShAmt); // Check to see if we are shifting out any of the bits being // compared. if (ConstantExpr::get(ShiftOpcode, NewCst, ShAmt) != RHS) { // If we shifted bits out, the fold is not going to work out. // As a special case, check to see if this means that the // result is always true or false now. if (ICI.getPredicate() == ICmpInst::ICMP_EQ) return replaceInstUsesWith(ICI, Builder->getFalse()); if (ICI.getPredicate() == ICmpInst::ICMP_NE) return replaceInstUsesWith(ICI, Builder->getTrue()); } else { ICI.setOperand(1, NewCst); Constant *NewAndCst; if (ShiftOpcode == Instruction::Shl) NewAndCst = ConstantExpr::getLShr(AndCst, ShAmt); else NewAndCst = ConstantExpr::getShl(AndCst, ShAmt); LHSI->setOperand(1, NewAndCst); LHSI->setOperand(0, Shift->getOperand(0)); Worklist.Add(Shift); // Shift is dead. return &ICI; } } } // Turn ((X >> Y) & C) == 0 into (X & (C << Y)) == 0. The later is // preferable because it allows the C<hasOneUse() && RHSV == 0 && ICI.isEquality() && !Shift->isArithmeticShift() && !isa(Shift->getOperand(0))) { // Compute C << Y. 
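// The rewrite here turns "((X >> Y) & C) == 0" into "(X & (C << Y)) == 0"
// (the shl form shifts the mask right instead): any mask bit shifted out on
// one side lines up with a bit the other side already treats as zero, so the
// two tests agree in fixed-width arithmetic.  Exhaustive check over 8-bit
// values; illustration only, not the LLVM code.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned X = 0; X <= 255; ++X)
    for (unsigned C = 0; C <= 255; ++C)
      for (unsigned Y = 0; Y <= 7; ++Y) {
        bool Original  = (((uint8_t)(X >> Y)) & C) == 0; // shift X, then mask
        bool Rewritten = (X & (uint8_t)(C << Y)) == 0;   // shift the mask instead
        assert(Original == Rewritten);
      }
  return 0;
}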
Value *NS; if (Shift->getOpcode() == Instruction::LShr) { NS = Builder->CreateShl(AndCst, Shift->getOperand(1)); } else { // Insert a logical shift. NS = Builder->CreateLShr(AndCst, Shift->getOperand(1)); } // Compute X & (C << Y). Value *NewAnd = Builder->CreateAnd(Shift->getOperand(0), NS, LHSI->getName()); ICI.setOperand(0, NewAnd); return &ICI; } // (icmp pred (and (or (lshr X, Y), X), 1), 0) --> // (icmp pred (and X, (or (shl 1, Y), 1), 0)) // // iff pred isn't signed { Value *X, *Y, *LShr; if (!ICI.isSigned() && RHSV == 0) { if (match(LHSI->getOperand(1), m_One())) { Constant *One = cast(LHSI->getOperand(1)); Value *Or = LHSI->getOperand(0); if (match(Or, m_Or(m_Value(LShr), m_Value(X))) && match(LShr, m_LShr(m_Specific(X), m_Value(Y)))) { unsigned UsesRemoved = 0; if (LHSI->hasOneUse()) ++UsesRemoved; if (Or->hasOneUse()) ++UsesRemoved; if (LShr->hasOneUse()) ++UsesRemoved; Value *NewOr = nullptr; // Compute X & ((1 << Y) | 1) if (auto *C = dyn_cast(Y)) { if (UsesRemoved >= 1) NewOr = ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One); } else { if (UsesRemoved >= 3) NewOr = Builder->CreateOr(Builder->CreateShl(One, Y, LShr->getName(), /*HasNUW=*/true), One, Or->getName()); } if (NewOr) { Value *NewAnd = Builder->CreateAnd(X, NewOr, LHSI->getName()); ICI.setOperand(0, NewAnd); return &ICI; } } } } } // Replace ((X & AndCst) > RHSV) with ((X & AndCst) != 0), if any // bit set in (X & AndCst) will produce a result greater than RHSV. if (ICI.getPredicate() == ICmpInst::ICMP_UGT) { unsigned NTZ = AndCst->getValue().countTrailingZeros(); if ((NTZ < AndCst->getBitWidth()) && APInt::getOneBitSet(AndCst->getBitWidth(), NTZ).ugt(RHSV)) return new ICmpInst(ICmpInst::ICMP_NE, LHSI, Constant::getNullValue(RHS->getType())); } } // Try to optimize things like "A[i]&42 == 0" to index computations. if (LoadInst *LI = dyn_cast(LHSI->getOperand(0))) { if (GetElementPtrInst *GEP = dyn_cast(LI->getOperand(0))) if (GlobalVariable *GV = dyn_cast(GEP->getOperand(0))) if (GV->isConstant() && GV->hasDefinitiveInitializer() && !LI->isVolatile() && isa(LHSI->getOperand(1))) { ConstantInt *C = cast(LHSI->getOperand(1)); if (Instruction *Res = FoldCmpLoadFromIndexedGlobal(GEP, GV,ICI, C)) return Res; } } // X & -C == -C -> X > u ~C // X & -C != -C -> X <= u ~C // iff C is a power of 2 if (ICI.isEquality() && RHS == LHSI->getOperand(1) && (-RHSV).isPowerOf2()) return new ICmpInst( ICI.getPredicate() == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE, LHSI->getOperand(0), SubOne(RHS)); // (icmp eq (and %A, C), 0) -> (icmp sgt (trunc %A), -1) // iff C is a power of 2 if (ICI.isEquality() && LHSI->hasOneUse() && match(RHS, m_Zero())) { if (auto *CI = dyn_cast(LHSI->getOperand(1))) { const APInt &AI = CI->getValue(); int32_t ExactLogBase2 = AI.exactLogBase2(); if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) { Type *NTy = IntegerType::get(ICI.getContext(), ExactLogBase2 + 1); Value *Trunc = Builder->CreateTrunc(LHSI->getOperand(0), NTy); return new ICmpInst(ICI.getPredicate() == ICmpInst::ICMP_EQ ? 
ICmpInst::ICMP_SGE : ICmpInst::ICMP_SLT, Trunc, Constant::getNullValue(NTy)); } } } break; case Instruction::Or: { if (RHS->isOne()) { // icmp slt signum(V) 1 --> icmp slt V, 1 Value *V = nullptr; if (ICI.getPredicate() == ICmpInst::ICMP_SLT && match(LHSI, m_Signum(m_Value(V)))) return new ICmpInst(ICmpInst::ICMP_SLT, V, ConstantInt::get(V->getType(), 1)); } if (!ICI.isEquality() || !RHS->isNullValue() || !LHSI->hasOneUse()) break; Value *P, *Q; if (match(LHSI, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) { // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0 // -> and (icmp eq P, null), (icmp eq Q, null). Value *ICIP = Builder->CreateICmp(ICI.getPredicate(), P, Constant::getNullValue(P->getType())); Value *ICIQ = Builder->CreateICmp(ICI.getPredicate(), Q, Constant::getNullValue(Q->getType())); Instruction *Op; if (ICI.getPredicate() == ICmpInst::ICMP_EQ) Op = BinaryOperator::CreateAnd(ICIP, ICIQ); else Op = BinaryOperator::CreateOr(ICIP, ICIQ); return Op; } break; } case Instruction::Mul: { // (icmp pred (mul X, Val), CI) ConstantInt *Val = dyn_cast(LHSI->getOperand(1)); if (!Val) break; // If this is a signed comparison to 0 and the mul is sign preserving, // use the mul LHS operand instead. ICmpInst::Predicate pred = ICI.getPredicate(); if (isSignTest(pred, RHS) && !Val->isZero() && cast(LHSI)->hasNoSignedWrap()) return new ICmpInst(Val->isNegative() ? ICmpInst::getSwappedPredicate(pred) : pred, LHSI->getOperand(0), Constant::getNullValue(RHS->getType())); break; } case Instruction::Shl: { // (icmp pred (shl X, ShAmt), CI) uint32_t TypeBits = RHSV.getBitWidth(); ConstantInt *ShAmt = dyn_cast(LHSI->getOperand(1)); if (!ShAmt) { Value *X; // (1 << X) pred P2 -> X pred Log2(P2) if (match(LHSI, m_Shl(m_One(), m_Value(X)))) { bool RHSVIsPowerOf2 = RHSV.isPowerOf2(); ICmpInst::Predicate Pred = ICI.getPredicate(); if (ICI.isUnsigned()) { if (!RHSVIsPowerOf2) { // (1 << X) < 30 -> X <= 4 // (1 << X) <= 30 -> X <= 4 // (1 << X) >= 30 -> X > 4 // (1 << X) > 30 -> X > 4 if (Pred == ICmpInst::ICMP_ULT) Pred = ICmpInst::ICMP_ULE; else if (Pred == ICmpInst::ICMP_UGE) Pred = ICmpInst::ICMP_UGT; } unsigned RHSLog2 = RHSV.logBase2(); // (1 << X) >= 2147483648 -> X >= 31 -> X == 31 // (1 << X) < 2147483648 -> X < 31 -> X != 31 if (RHSLog2 == TypeBits-1) { if (Pred == ICmpInst::ICMP_UGE) Pred = ICmpInst::ICMP_EQ; else if (Pred == ICmpInst::ICMP_ULT) Pred = ICmpInst::ICMP_NE; } return new ICmpInst(Pred, X, ConstantInt::get(RHS->getType(), RHSLog2)); } else if (ICI.isSigned()) { if (RHSV.isAllOnesValue()) { // (1 << X) <= -1 -> X == 31 if (Pred == ICmpInst::ICMP_SLE) return new ICmpInst(ICmpInst::ICMP_EQ, X, ConstantInt::get(RHS->getType(), TypeBits-1)); // (1 << X) > -1 -> X != 31 if (Pred == ICmpInst::ICMP_SGT) return new ICmpInst(ICmpInst::ICMP_NE, X, ConstantInt::get(RHS->getType(), TypeBits-1)); } else if (!RHSV) { // (1 << X) < 0 -> X == 31 // (1 << X) <= 0 -> X == 31 if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) return new ICmpInst(ICmpInst::ICMP_EQ, X, ConstantInt::get(RHS->getType(), TypeBits-1)); // (1 << X) >= 0 -> X != 31 // (1 << X) > 0 -> X != 31 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) return new ICmpInst(ICmpInst::ICMP_NE, X, ConstantInt::get(RHS->getType(), TypeBits-1)); } } else if (ICI.isEquality()) { if (RHSVIsPowerOf2) return new ICmpInst( Pred, X, ConstantInt::get(RHS->getType(), RHSV.logBase2())); } } break; } // Check that the shift amount is in range. If not, don't perform // undefined shifts. 
When the shift is visited it will be // simplified. if (ShAmt->uge(TypeBits)) break; if (ICI.isEquality()) { // If we are comparing against bits always shifted out, the // comparison cannot succeed. Constant *Comp = ConstantExpr::getShl(ConstantExpr::getLShr(RHS, ShAmt), ShAmt); if (Comp != RHS) {// Comparing against a bit that we know is zero. bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; Constant *Cst = Builder->getInt1(IsICMP_NE); return replaceInstUsesWith(ICI, Cst); } // If the shift is NUW, then it is just shifting out zeros, no need for an // AND. if (cast(LHSI)->hasNoUnsignedWrap()) return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), ConstantExpr::getLShr(RHS, ShAmt)); // If the shift is NSW and we compare to 0, then it is just shifting out // sign bits, no need for an AND either. if (cast(LHSI)->hasNoSignedWrap() && RHSV == 0) return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), ConstantExpr::getLShr(RHS, ShAmt)); if (LHSI->hasOneUse()) { // Otherwise strength reduce the shift into an and. uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits); Constant *Mask = Builder->getInt(APInt::getLowBitsSet(TypeBits, TypeBits - ShAmtVal)); Value *And = Builder->CreateAnd(LHSI->getOperand(0),Mask, LHSI->getName()+".mask"); return new ICmpInst(ICI.getPredicate(), And, ConstantExpr::getLShr(RHS, ShAmt)); } } // If this is a signed comparison to 0 and the shift is sign preserving, // use the shift LHS operand instead. ICmpInst::Predicate pred = ICI.getPredicate(); if (isSignTest(pred, RHS) && cast(LHSI)->hasNoSignedWrap()) return new ICmpInst(pred, LHSI->getOperand(0), Constant::getNullValue(RHS->getType())); // Otherwise, if this is a comparison of the sign bit, simplify to and/test. bool TrueIfSigned = false; if (LHSI->hasOneUse() && isSignBitCheck(ICI.getPredicate(), RHS, TrueIfSigned)) { // (X << 31) (X&1) != 0 Constant *Mask = ConstantInt::get(LHSI->getOperand(0)->getType(), APInt::getOneBitSet(TypeBits, TypeBits-ShAmt->getZExtValue()-1)); Value *And = Builder->CreateAnd(LHSI->getOperand(0), Mask, LHSI->getName()+".mask"); return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ, And, Constant::getNullValue(And->getType())); } // Transform (icmp pred iM (shl iM %v, N), CI) // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (CI>>N)) // Transform the shl to a trunc if (trunc (CI>>N)) has no loss and M-N. // This enables to get rid of the shift in favor of a trunc which can be // free on the target. It has the additional benefit of comparing to a // smaller constant, which will be target friendly. unsigned Amt = ShAmt->getLimitedValue(TypeBits-1); if (LHSI->hasOneUse() && Amt != 0 && RHSV.countTrailingZeros() >= Amt) { Type *NTy = IntegerType::get(ICI.getContext(), TypeBits - Amt); Constant *NCI = ConstantExpr::getTrunc( ConstantExpr::getAShr(RHS, ConstantInt::get(RHS->getType(), Amt)), NTy); return new ICmpInst(ICI.getPredicate(), Builder->CreateTrunc(LHSI->getOperand(0), NTy), NCI); } break; } case Instruction::LShr: // (icmp pred (shr X, ShAmt), CI) case Instruction::AShr: { // Handle equality comparisons of shift-by-constant. BinaryOperator *BO = cast(LHSI); if (ConstantInt *ShAmt = dyn_cast(LHSI->getOperand(1))) { if (Instruction *Res = FoldICmpShrCst(ICI, BO, ShAmt)) return Res; } // Handle exact shr's. 
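    // For example (illustrative): (icmp eq (lshr exact i32 %x, %n), 0) can
    // only be true when %x itself is zero, because an exact shift never drops
    // set bits, so the compare can be performed directly on %x.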
if (ICI.isEquality() && BO->isExact() && BO->hasOneUse()) { if (RHSV.isMinValue()) return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), RHS); } break; } case Instruction::UDiv: if (ConstantInt *DivLHS = dyn_cast(LHSI->getOperand(0))) { Value *X = LHSI->getOperand(1); const APInt &C1 = RHS->getValue(); const APInt &C2 = DivLHS->getValue(); assert(C2 != 0 && "udiv 0, X should have been simplified already."); // (icmp ugt (udiv C2, X), C1) -> (icmp ule X, C2/(C1+1)) if (ICI.getPredicate() == ICmpInst::ICMP_UGT) { assert(!C1.isMaxValue() && "icmp ugt X, UINT_MAX should have been simplified already."); return new ICmpInst(ICmpInst::ICMP_ULE, X, ConstantInt::get(X->getType(), C2.udiv(C1 + 1))); } // (icmp ult (udiv C2, X), C1) -> (icmp ugt X, C2/C1) if (ICI.getPredicate() == ICmpInst::ICMP_ULT) { assert(C1 != 0 && "icmp ult X, 0 should have been simplified already."); return new ICmpInst(ICmpInst::ICMP_UGT, X, ConstantInt::get(X->getType(), C2.udiv(C1))); } } // fall-through case Instruction::SDiv: // Fold: icmp pred ([us]div X, C1), C2 -> range test // Fold this div into the comparison, producing a range check. // Determine, based on the divide type, what the range is being // checked. If there is an overflow on the low or high side, remember // it, otherwise compute the range [low, hi) bounding the new value. // See: InsertRangeTest above for the kinds of replacements possible. if (ConstantInt *DivRHS = dyn_cast(LHSI->getOperand(1))) if (Instruction *R = FoldICmpDivCst(ICI, cast(LHSI), DivRHS)) return R; break; case Instruction::Sub: { ConstantInt *LHSC = dyn_cast(LHSI->getOperand(0)); if (!LHSC) break; const APInt &LHSV = LHSC->getValue(); // C1-X (X|(C2-1)) == C1 // iff C1 & (C2-1) == C2-1 // C2 is a power of 2 if (ICI.getPredicate() == ICmpInst::ICMP_ULT && LHSI->hasOneUse() && RHSV.isPowerOf2() && (LHSV & (RHSV - 1)) == (RHSV - 1)) return new ICmpInst(ICmpInst::ICMP_EQ, Builder->CreateOr(LHSI->getOperand(1), RHSV - 1), LHSC); // C1-X >u C2 -> (X|C2) != C1 // iff C1 & C2 == C2 // C2+1 is a power of 2 if (ICI.getPredicate() == ICmpInst::ICMP_UGT && LHSI->hasOneUse() && (RHSV + 1).isPowerOf2() && (LHSV & RHSV) == RHSV) return new ICmpInst(ICmpInst::ICMP_NE, Builder->CreateOr(LHSI->getOperand(1), RHSV), LHSC); break; } case Instruction::Add: // Fold: icmp pred (add X, C1), C2 if (!ICI.isEquality()) { ConstantInt *LHSC = dyn_cast(LHSI->getOperand(1)); if (!LHSC) break; const APInt &LHSV = LHSC->getValue(); ConstantRange CR = ICI.makeConstantRange(ICI.getPredicate(), RHSV) .subtract(LHSV); if (ICI.isSigned()) { if (CR.getLower().isSignBit()) { return new ICmpInst(ICmpInst::ICMP_SLT, LHSI->getOperand(0), Builder->getInt(CR.getUpper())); } else if (CR.getUpper().isSignBit()) { return new ICmpInst(ICmpInst::ICMP_SGE, LHSI->getOperand(0), Builder->getInt(CR.getLower())); } } else { if (CR.getLower().isMinValue()) { return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0), Builder->getInt(CR.getUpper())); } else if (CR.getUpper().isMinValue()) { return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0), Builder->getInt(CR.getLower())); } } // X-C1 (X & -C2) == C1 // iff C1 & (C2-1) == 0 // C2 is a power of 2 if (ICI.getPredicate() == ICmpInst::ICMP_ULT && LHSI->hasOneUse() && RHSV.isPowerOf2() && (LHSV & (RHSV - 1)) == 0) return new ICmpInst(ICmpInst::ICMP_EQ, Builder->CreateAnd(LHSI->getOperand(0), -RHSV), ConstantExpr::getNeg(LHSC)); // X-C1 >u C2 -> (X & ~C2) != C1 // iff C1 & C2 == 0 // C2+1 is a power of 2 if (ICI.getPredicate() == ICmpInst::ICMP_UGT && LHSI->hasOneUse() && (RHSV + 
1).isPowerOf2() && (LHSV & RHSV) == 0) return new ICmpInst(ICmpInst::ICMP_NE, Builder->CreateAnd(LHSI->getOperand(0), ~RHSV), ConstantExpr::getNeg(LHSC)); } break; } // Simplify icmp_eq and icmp_ne instructions with integer constant RHS. if (ICI.isEquality()) { bool isICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; // If the first operand is (add|sub|and|or|xor|rem) with a constant, and // the second operand is a constant, simplify a bit. if (BinaryOperator *BO = dyn_cast(LHSI)) { switch (BO->getOpcode()) { case Instruction::SRem: // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one. if (RHSV == 0 && isa(BO->getOperand(1)) &&BO->hasOneUse()){ const APInt &V = cast(BO->getOperand(1))->getValue(); if (V.sgt(1) && V.isPowerOf2()) { Value *NewRem = Builder->CreateURem(BO->getOperand(0), BO->getOperand(1), BO->getName()); return new ICmpInst(ICI.getPredicate(), NewRem, Constant::getNullValue(BO->getType())); } } break; case Instruction::Add: // Replace ((add A, B) != C) with (A != C-B) if B & C are constants. if (ConstantInt *BOp1C = dyn_cast(BO->getOperand(1))) { if (BO->hasOneUse()) return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), ConstantExpr::getSub(RHS, BOp1C)); } else if (RHSV == 0) { // Replace ((add A, B) != 0) with (A != -B) if A or B is // efficiently invertible, or if the add has just this one use. Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1); if (Value *NegVal = dyn_castNegVal(BOp1)) return new ICmpInst(ICI.getPredicate(), BOp0, NegVal); if (Value *NegVal = dyn_castNegVal(BOp0)) return new ICmpInst(ICI.getPredicate(), NegVal, BOp1); if (BO->hasOneUse()) { Value *Neg = Builder->CreateNeg(BOp1); Neg->takeName(BO); return new ICmpInst(ICI.getPredicate(), BOp0, Neg); } } break; case Instruction::Xor: if (BO->hasOneUse()) { if (Constant *BOC = dyn_cast(BO->getOperand(1))) { // For the xor case, we can xor two constants together, eliminating // the explicit xor. return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), ConstantExpr::getXor(RHS, BOC)); } else if (RHSV == 0) { // Replace ((xor A, B) != 0) with (A != B) return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), BO->getOperand(1)); } } break; case Instruction::Sub: if (BO->hasOneUse()) { if (ConstantInt *BOp0C = dyn_cast(BO->getOperand(0))) { // Replace ((sub A, B) != C) with (B != A-C) if A & C are constants. return new ICmpInst(ICI.getPredicate(), BO->getOperand(1), ConstantExpr::getSub(BOp0C, RHS)); } else if (RHSV == 0) { // Replace ((sub A, B) != 0) with (A != B) return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), BO->getOperand(1)); } } break; case Instruction::Or: // If bits are being or'd in that are not present in the constant we // are comparing against, then the comparison could never succeed! if (ConstantInt *BOC = dyn_cast(BO->getOperand(1))) { Constant *NotCI = ConstantExpr::getNot(RHS); if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue()) return replaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE)); // Comparing if all bits outside of a constant mask are set? // Replace (X | C) == -1 with (X & ~C) == ~C. // This removes the -1 constant. if (BO->hasOneUse() && RHS->isAllOnesValue()) { Constant *NotBOC = ConstantExpr::getNot(BOC); Value *And = Builder->CreateAnd(BO->getOperand(0), NotBOC); return new ICmpInst(ICI.getPredicate(), And, NotBOC); } } break; case Instruction::And: if (ConstantInt *BOC = dyn_cast(BO->getOperand(1))) { // If bits are being compared against that are and'd out, then the // comparison can never succeed! 
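          // For example (illustrative constants): (X & 15) == 19 can never be
          // true because bit 4 of 19 is cleared by the mask, i.e.
          // 19 & ~15 == 16 != 0; equality folds to false and inequality to
          // true.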
if ((RHSV & ~BOC->getValue()) != 0) return replaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE)); // If we have ((X & C) == C), turn it into ((X & C) != 0). if (RHS == BOC && RHSV.isPowerOf2()) return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, LHSI, Constant::getNullValue(RHS->getType())); // Don't perform the following transforms if the AND has multiple uses if (!BO->hasOneUse()) break; // Replace (and X, (1 << size(X)-1) != 0) with x s< 0 if (BOC->getValue().isSignBit()) { Value *X = BO->getOperand(0); Constant *Zero = Constant::getNullValue(X->getType()); ICmpInst::Predicate pred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE; return new ICmpInst(pred, X, Zero); } // ((X & ~7) == 0) --> X < 8 if (RHSV == 0 && isHighOnes(BOC)) { Value *X = BO->getOperand(0); Constant *NegX = ConstantExpr::getNeg(BOC); ICmpInst::Predicate pred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT; return new ICmpInst(pred, X, NegX); } } break; case Instruction::Mul: if (RHSV == 0 && BO->hasNoSignedWrap()) { if (ConstantInt *BOC = dyn_cast(BO->getOperand(1))) { // The trivial case (mul X, 0) is handled by InstSimplify // General case : (mul X, C) != 0 iff X != 0 // (mul X, C) == 0 iff X == 0 if (!BOC->isZero()) return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), Constant::getNullValue(RHS->getType())); } } break; default: break; } } else if (IntrinsicInst *II = dyn_cast(LHSI)) { // Handle icmp {eq|ne} , intcst. switch (II->getIntrinsicID()) { case Intrinsic::bswap: Worklist.Add(II); ICI.setOperand(0, II->getArgOperand(0)); ICI.setOperand(1, Builder->getInt(RHSV.byteSwap())); return &ICI; case Intrinsic::ctlz: case Intrinsic::cttz: // ctz(A) == bitwidth(a) -> A == 0 and likewise for != if (RHSV == RHS->getType()->getBitWidth()) { Worklist.Add(II); ICI.setOperand(0, II->getArgOperand(0)); ICI.setOperand(1, ConstantInt::get(RHS->getType(), 0)); return &ICI; } break; case Intrinsic::ctpop: // popcount(A) == 0 -> A == 0 and likewise for != if (RHS->isZero()) { Worklist.Add(II); ICI.setOperand(0, II->getArgOperand(0)); ICI.setOperand(1, RHS); return &ICI; } break; default: break; } } } return nullptr; } /// Handle icmp (cast x to y), (cast/cst). We only handle extending casts so /// far. Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICmp) { const CastInst *LHSCI = cast(ICmp.getOperand(0)); Value *LHSCIOp = LHSCI->getOperand(0); Type *SrcTy = LHSCIOp->getType(); Type *DestTy = LHSCI->getType(); Value *RHSCIOp; // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the // integer type is the same size as the pointer type. if (LHSCI->getOpcode() == Instruction::PtrToInt && DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) { Value *RHSOp = nullptr; if (auto *RHSC = dyn_cast(ICmp.getOperand(1))) { Value *RHSCIOp = RHSC->getOperand(0); if (RHSCIOp->getType()->getPointerAddressSpace() == LHSCIOp->getType()->getPointerAddressSpace()) { RHSOp = RHSC->getOperand(0); // If the pointer types don't match, insert a bitcast. if (LHSCIOp->getType() != RHSOp->getType()) RHSOp = Builder->CreateBitCast(RHSOp, LHSCIOp->getType()); } } else if (auto *RHSC = dyn_cast(ICmp.getOperand(1))) { RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy); } if (RHSOp) return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSOp); } // The code below only handles extension cast instructions, so far. // Enforce this. 
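  // A typical case handled below (illustrative, not exhaustive):
  //   icmp ult (zext i8 %a to i32), (zext i8 %b to i32)
  // becomes icmp ult i8 %a, %b, while a mix of sext and zext on the two
  // sides is rejected because the widened values compare differently.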
if (LHSCI->getOpcode() != Instruction::ZExt && LHSCI->getOpcode() != Instruction::SExt) return nullptr; bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt; bool isSignedCmp = ICmp.isSigned(); if (auto *CI = dyn_cast(ICmp.getOperand(1))) { // Not an extension from the same type? RHSCIOp = CI->getOperand(0); if (RHSCIOp->getType() != LHSCIOp->getType()) return nullptr; // If the signedness of the two casts doesn't agree (i.e. one is a sext // and the other is a zext), then we can't handle this. if (CI->getOpcode() != LHSCI->getOpcode()) return nullptr; // Deal with equality cases early. if (ICmp.isEquality()) return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp); // A signed comparison of sign extended values simplifies into a // signed comparison. if (isSignedCmp && isSignedExt) return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp); // The other three cases all fold into an unsigned comparison. return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, RHSCIOp); } // If we aren't dealing with a constant on the RHS, exit early. auto *C = dyn_cast(ICmp.getOperand(1)); if (!C) return nullptr; // Compute the constant that would happen if we truncated to SrcTy then // re-extended to DestTy. Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy); Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(), Res1, DestTy); // If the re-extended constant didn't change... if (Res2 == C) { // Deal with equality cases early. if (ICmp.isEquality()) return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1); // A signed comparison of sign extended values simplifies into a // signed comparison. if (isSignedExt && isSignedCmp) return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1); // The other three cases all fold into an unsigned comparison. return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, Res1); } // The re-extended constant changed, partly changed (in the case of a vector), // or could not be determined to be equal (in the case of a constant // expression), so the constant cannot be represented in the shorter type. // Consequently, we cannot emit a simple comparison. // All the cases that fold to true or false will have already been handled // by SimplifyICmpInst, so only deal with the tricky case. if (isSignedCmp || !isSignedExt || !isa(C)) return nullptr; // Evaluate the comparison for LT (we invert for GT below). LE and GE cases // should have been folded away previously and not enter in here. // We're performing an unsigned comp with a sign extended value. // This is true if the input is >= 0. [aka >s -1] Constant *NegOne = Constant::getAllOnesValue(SrcTy); Value *Result = Builder->CreateICmpSGT(LHSCIOp, NegOne, ICmp.getName()); // Finally, return the value computed. if (ICmp.getPredicate() == ICmpInst::ICMP_ULT) return replaceInstUsesWith(ICmp, Result); assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!"); return BinaryOperator::CreateNot(Result); } /// The caller has matched a pattern of the form: /// I = icmp ugt (add (add A, B), CI2), CI1 /// If this is of the form: /// sum = a + b /// if (sum+128 >u 255) /// Then replace it with llvm.sadd.with.overflow.i8. /// static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, ConstantInt *CI2, ConstantInt *CI1, InstCombiner &IC) { // The transformation we're trying to do here is to transform this into an // llvm.sadd.with.overflow. 
  // To do this, we have to replace the original add with a narrower add, and
  // discard the add-with-constant that is part of the range check (if we
  // can't eliminate it, this isn't profitable).

  // In order to eliminate the add-with-constant, the compare can be its only
  // use.
  Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
  if (!AddWithCst->hasOneUse())
    return nullptr;

  // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
  if (!CI2->getValue().isPowerOf2())
    return nullptr;
  unsigned NewWidth = CI2->getValue().countTrailingZeros();
  if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31)
    return nullptr;

  // The width of the new add formed is 1 more than the bias.
  ++NewWidth;

  // Check to see that CI1 is an all-ones value with NewWidth bits.
  if (CI1->getBitWidth() == NewWidth ||
      CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
    return nullptr;

  // This is only really a signed overflow check if the inputs have been
  // sign-extended; check for that condition. For example, if CI2 is 2^31 and
  // the operands of the add are 64 bits wide, we need at least 33 sign bits.
  unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
  if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits ||
      IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
    return nullptr;

  // In order to replace the original add with a narrower
  // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
  // and truncates that discard the high bits of the add.  Verify that this is
  // the case.
  Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0));
  for (User *U : OrigAdd->users()) {
    if (U == AddWithCst)
      continue;

    // Only accept truncates for now.  We would really like a nice recursive
    // predicate like SimplifyDemandedBits, but which goes downwards the
    // use-def chain to see which bits of a value are actually demanded.  If
    // the original add had another add which was then immediately truncated,
    // we could still do the transformation.
    TruncInst *TI = dyn_cast<TruncInst>(U);
    if (!TI || TI->getType()->getPrimitiveSizeInBits() > NewWidth)
      return nullptr;
  }

  // If the pattern matches, truncate the inputs to the narrower type and
  // use the sadd_with_overflow intrinsic to efficiently compute both the
  // result and the overflow bit.
  Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
  Value *F = Intrinsic::getDeclaration(I.getModule(),
                                       Intrinsic::sadd_with_overflow, NewType);

  InstCombiner::BuilderTy *Builder = IC.Builder;

  // Put the new code above the original add, in case there are any uses of the
  // add between the add and the compare.
  Builder->SetInsertPoint(OrigAdd);

  Value *TruncA = Builder->CreateTrunc(A, NewType, A->getName() + ".trunc");
  Value *TruncB = Builder->CreateTrunc(B, NewType, B->getName() + ".trunc");
  CallInst *Call = Builder->CreateCall(F, {TruncA, TruncB}, "sadd");
  Value *Add = Builder->CreateExtractValue(Call, 0, "sadd.result");
  Value *ZExt = Builder->CreateZExt(Add, OrigAdd->getType());

  // The inner add was the result of the narrow add, zero extended to the
  // wider type.  Replace it with the result computed by the intrinsic.
  IC.replaceInstUsesWith(*OrigAdd, ZExt);

  // The original icmp gets replaced with the overflow value.
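  // A concrete shape of the whole rewrite (a sketch with i8 operands that
  // were sign-extended to i32; names are illustrative only):
  //   %s   = add i32 %a, %b
  //   %chk = add i32 %s, 128
  //   %cmp = icmp ugt i32 %chk, 255        ; CI2 = 128 = 2^7, CI1 = 255
  // is rewritten so that
  //   %r   = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 %ta, i8 %tb)
  // supplies both the narrow sum (zero-extended back to i32, replacing %s)
  // and the overflow bit that replaces %cmp, with %ta/%tb the truncated
  // inputs.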
return ExtractValueInst::Create(Call, 1, "sadd.overflow"); } bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS, Value *RHS, Instruction &OrigI, Value *&Result, Constant *&Overflow) { if (OrigI.isCommutative() && isa(LHS) && !isa(RHS)) std::swap(LHS, RHS); auto SetResult = [&](Value *OpResult, Constant *OverflowVal, bool ReuseName) { Result = OpResult; Overflow = OverflowVal; if (ReuseName) Result->takeName(&OrigI); return true; }; // If the overflow check was an add followed by a compare, the insertion point // may be pointing to the compare. We want to insert the new instructions // before the add in case there are uses of the add between the add and the // compare. Builder->SetInsertPoint(&OrigI); switch (OCF) { case OCF_INVALID: llvm_unreachable("bad overflow check kind!"); case OCF_UNSIGNED_ADD: { OverflowResult OR = computeOverflowForUnsignedAdd(LHS, RHS, &OrigI); if (OR == OverflowResult::NeverOverflows) return SetResult(Builder->CreateNUWAdd(LHS, RHS), Builder->getFalse(), true); if (OR == OverflowResult::AlwaysOverflows) return SetResult(Builder->CreateAdd(LHS, RHS), Builder->getTrue(), true); } // FALL THROUGH uadd into sadd case OCF_SIGNED_ADD: { // X + 0 -> {X, false} if (match(RHS, m_Zero())) return SetResult(LHS, Builder->getFalse(), false); // We can strength reduce this signed add into a regular add if we can prove // that it will never overflow. if (OCF == OCF_SIGNED_ADD) if (WillNotOverflowSignedAdd(LHS, RHS, OrigI)) return SetResult(Builder->CreateNSWAdd(LHS, RHS), Builder->getFalse(), true); break; } case OCF_UNSIGNED_SUB: case OCF_SIGNED_SUB: { // X - 0 -> {X, false} if (match(RHS, m_Zero())) return SetResult(LHS, Builder->getFalse(), false); if (OCF == OCF_SIGNED_SUB) { if (WillNotOverflowSignedSub(LHS, RHS, OrigI)) return SetResult(Builder->CreateNSWSub(LHS, RHS), Builder->getFalse(), true); } else { if (WillNotOverflowUnsignedSub(LHS, RHS, OrigI)) return SetResult(Builder->CreateNUWSub(LHS, RHS), Builder->getFalse(), true); } break; } case OCF_UNSIGNED_MUL: { OverflowResult OR = computeOverflowForUnsignedMul(LHS, RHS, &OrigI); if (OR == OverflowResult::NeverOverflows) return SetResult(Builder->CreateNUWMul(LHS, RHS), Builder->getFalse(), true); if (OR == OverflowResult::AlwaysOverflows) return SetResult(Builder->CreateMul(LHS, RHS), Builder->getTrue(), true); } // FALL THROUGH case OCF_SIGNED_MUL: // X * undef -> undef if (isa(RHS)) return SetResult(RHS, UndefValue::get(Builder->getInt1Ty()), false); // X * 0 -> {0, false} if (match(RHS, m_Zero())) return SetResult(RHS, Builder->getFalse(), false); // X * 1 -> {X, false} if (match(RHS, m_One())) return SetResult(LHS, Builder->getFalse(), false); if (OCF == OCF_SIGNED_MUL) if (WillNotOverflowSignedMul(LHS, RHS, OrigI)) return SetResult(Builder->CreateNSWMul(LHS, RHS), Builder->getFalse(), true); break; } return false; } /// \brief Recognize and process idiom involving test for multiplication /// overflow. /// /// The caller has matched a pattern of the form: /// I = cmp u (mul(zext A, zext B), V /// The function checks if this is a test for overflow and if so replaces /// multiplication with call to 'mul.with.overflow' intrinsic. /// /// \param I Compare instruction. /// \param MulVal Result of 'mult' instruction. It is one of the arguments of /// the compare instruction. Must be of integer type. /// \param OtherVal The other argument of compare instruction. /// \returns Instruction which must replace the compare instruction, NULL if no /// replacement required. 
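/// Illustrative shape of the idiom (names and types are examples only):
///   %za  = zext i8 %a to i32
///   %zb  = zext i8 %b to i32
///   %mul = mul i32 %za, %zb
///   %cmp = icmp ugt i32 %mul, 255
/// which tests for unsigned overflow of the 8-bit product and can be
/// rewritten around @llvm.umul.with.overflow.i8(i8 %a, i8 %b).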
static Instruction *ProcessUMulZExtIdiom(ICmpInst &I, Value *MulVal, Value *OtherVal, InstCombiner &IC) { // Don't bother doing this transformation for pointers, don't do it for // vectors. if (!isa(MulVal->getType())) return nullptr; assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal); assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal); auto *MulInstr = dyn_cast(MulVal); if (!MulInstr) return nullptr; assert(MulInstr->getOpcode() == Instruction::Mul); auto *LHS = cast(MulInstr->getOperand(0)), *RHS = cast(MulInstr->getOperand(1)); assert(LHS->getOpcode() == Instruction::ZExt); assert(RHS->getOpcode() == Instruction::ZExt); Value *A = LHS->getOperand(0), *B = RHS->getOperand(0); // Calculate type and width of the result produced by mul.with.overflow. Type *TyA = A->getType(), *TyB = B->getType(); unsigned WidthA = TyA->getPrimitiveSizeInBits(), WidthB = TyB->getPrimitiveSizeInBits(); unsigned MulWidth; Type *MulType; if (WidthB > WidthA) { MulWidth = WidthB; MulType = TyB; } else { MulWidth = WidthA; MulType = TyA; } // In order to replace the original mul with a narrower mul.with.overflow, // all uses must ignore upper bits of the product. The number of used low // bits must be not greater than the width of mul.with.overflow. if (MulVal->hasNUsesOrMore(2)) for (User *U : MulVal->users()) { if (U == &I) continue; if (TruncInst *TI = dyn_cast(U)) { // Check if truncation ignores bits above MulWidth. unsigned TruncWidth = TI->getType()->getPrimitiveSizeInBits(); if (TruncWidth > MulWidth) return nullptr; } else if (BinaryOperator *BO = dyn_cast(U)) { // Check if AND ignores bits above MulWidth. if (BO->getOpcode() != Instruction::And) return nullptr; if (ConstantInt *CI = dyn_cast(BO->getOperand(1))) { const APInt &CVal = CI->getValue(); if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth) return nullptr; } } else { // Other uses prohibit this transformation. return nullptr; } } // Recognize patterns switch (I.getPredicate()) { case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_NE: // Recognize pattern: // mulval = mul(zext A, zext B) // cmp eq/neq mulval, zext trunc mulval if (ZExtInst *Zext = dyn_cast(OtherVal)) if (Zext->hasOneUse()) { Value *ZextArg = Zext->getOperand(0); if (TruncInst *Trunc = dyn_cast(ZextArg)) if (Trunc->getType()->getPrimitiveSizeInBits() == MulWidth) break; //Recognized } // Recognize pattern: // mulval = mul(zext A, zext B) // cmp eq/neq mulval, and(mulval, mask), mask selects low MulWidth bits. 
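    // For instance (illustrative widths): with i8 operands zero-extended to
    // i32, MulWidth is 8 and the mask is 255, so
    //   icmp eq %mulval, (and %mulval, 255)
    // holds exactly when the product fits in 8 bits, i.e. when the narrow
    // multiplication does not overflow.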
ConstantInt *CI; Value *ValToMask; if (match(OtherVal, m_And(m_Value(ValToMask), m_ConstantInt(CI)))) { if (ValToMask != MulVal) return nullptr; const APInt &CVal = CI->getValue() + 1; if (CVal.isPowerOf2()) { unsigned MaskWidth = CVal.logBase2(); if (MaskWidth == MulWidth) break; // Recognized } } return nullptr; case ICmpInst::ICMP_UGT: // Recognize pattern: // mulval = mul(zext A, zext B) // cmp ugt mulval, max if (ConstantInt *CI = dyn_cast(OtherVal)) { APInt MaxVal = APInt::getMaxValue(MulWidth); MaxVal = MaxVal.zext(CI->getBitWidth()); if (MaxVal.eq(CI->getValue())) break; // Recognized } return nullptr; case ICmpInst::ICMP_UGE: // Recognize pattern: // mulval = mul(zext A, zext B) // cmp uge mulval, max+1 if (ConstantInt *CI = dyn_cast(OtherVal)) { APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth); if (MaxVal.eq(CI->getValue())) break; // Recognized } return nullptr; case ICmpInst::ICMP_ULE: // Recognize pattern: // mulval = mul(zext A, zext B) // cmp ule mulval, max if (ConstantInt *CI = dyn_cast(OtherVal)) { APInt MaxVal = APInt::getMaxValue(MulWidth); MaxVal = MaxVal.zext(CI->getBitWidth()); if (MaxVal.eq(CI->getValue())) break; // Recognized } return nullptr; case ICmpInst::ICMP_ULT: // Recognize pattern: // mulval = mul(zext A, zext B) // cmp ule mulval, max + 1 if (ConstantInt *CI = dyn_cast(OtherVal)) { APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth); if (MaxVal.eq(CI->getValue())) break; // Recognized } return nullptr; default: return nullptr; } InstCombiner::BuilderTy *Builder = IC.Builder; Builder->SetInsertPoint(MulInstr); // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B) Value *MulA = A, *MulB = B; if (WidthA < MulWidth) MulA = Builder->CreateZExt(A, MulType); if (WidthB < MulWidth) MulB = Builder->CreateZExt(B, MulType); Value *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::umul_with_overflow, MulType); CallInst *Call = Builder->CreateCall(F, {MulA, MulB}, "umul"); IC.Worklist.Add(MulInstr); // If there are uses of mul result other than the comparison, we know that // they are truncation or binary AND. Change them to use result of // mul.with.overflow and adjust properly mask/size. if (MulVal->hasNUsesOrMore(2)) { Value *Mul = Builder->CreateExtractValue(Call, 0, "umul.value"); for (User *U : MulVal->users()) { if (U == &I || U == OtherVal) continue; if (TruncInst *TI = dyn_cast(U)) { if (TI->getType()->getPrimitiveSizeInBits() == MulWidth) IC.replaceInstUsesWith(*TI, Mul); else TI->setOperand(0, Mul); } else if (BinaryOperator *BO = dyn_cast(U)) { assert(BO->getOpcode() == Instruction::And); // Replace (mul & mask) --> zext (mul.with.overflow & short_mask) ConstantInt *CI = cast(BO->getOperand(1)); APInt ShortMask = CI->getValue().trunc(MulWidth); Value *ShortAnd = Builder->CreateAnd(Mul, ShortMask); Instruction *Zext = cast(Builder->CreateZExt(ShortAnd, BO->getType())); IC.Worklist.Add(Zext); IC.replaceInstUsesWith(*BO, Zext); } else { llvm_unreachable("Unexpected Binary operation"); } IC.Worklist.Add(cast(U)); } } if (isa(OtherVal)) IC.Worklist.Add(cast(OtherVal)); // The original icmp gets replaced with the overflow value, maybe inverted // depending on predicate. 
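  // For instance, icmp eq %mulval, (and %mulval, 255) asserts that no
  // overflow occurred, so it is replaced by the negation of the intrinsic's
  // overflow bit, while the corresponding icmp ne uses the bit directly.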
  bool Inverse = false;
  switch (I.getPredicate()) {
  case ICmpInst::ICMP_NE:
    break;
  case ICmpInst::ICMP_EQ:
    Inverse = true;
    break;
  case ICmpInst::ICMP_UGT:
  case ICmpInst::ICMP_UGE:
    if (I.getOperand(0) == MulVal)
      break;
    Inverse = true;
    break;
  case ICmpInst::ICMP_ULT:
  case ICmpInst::ICMP_ULE:
    if (I.getOperand(1) == MulVal)
      break;
    Inverse = true;
    break;
  default:
    llvm_unreachable("Unexpected predicate");
  }
  if (Inverse) {
    Value *Res = Builder->CreateExtractValue(Call, 1);
    return BinaryOperator::CreateNot(Res);
  }

  return ExtractValueInst::Create(Call, 1);
}

/// When performing a comparison against a constant, it is possible that not
/// all the bits in the LHS are demanded. This helper method computes the mask
/// that IS demanded.
static APInt DemandedBitsLHSMask(ICmpInst &I,
                                 unsigned BitWidth, bool isSignCheck) {
  if (isSignCheck)
    return APInt::getSignBit(BitWidth);

  ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(1));
  if (!CI)
    return APInt::getAllOnesValue(BitWidth);
  const APInt &RHS = CI->getValue();

  switch (I.getPredicate()) {
  // For a UGT comparison, we don't care about any bits that
  // correspond to the trailing ones of the comparand.  The value of these
  // bits doesn't impact the outcome of the comparison, because any value
  // greater than the RHS must differ in a bit higher than these due to carry.
  case ICmpInst::ICMP_UGT: {
    unsigned trailingOnes = RHS.countTrailingOnes();
    APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingOnes);
    return ~lowBitsSet;
  }

  // Similarly, for a ULT comparison, we don't care about the trailing zeros.
  // Any value less than the RHS must differ in a higher bit because of
  // carries.
  case ICmpInst::ICMP_ULT: {
    unsigned trailingZeros = RHS.countTrailingZeros();
    APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingZeros);
    return ~lowBitsSet;
  }

  default:
    return APInt::getAllOnesValue(BitWidth);
  }
}

/// \brief Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst
/// should be swapped.
/// The decision is based on how many times these two operands are reused
/// as subtract operands and their positions in those instructions.
/// The rationale is that several architectures use the same instruction for
/// both subtract and cmp, thus it is better if the order of those operands
/// matches.
/// \return true if Op0 and Op1 should be swapped.
static bool swapMayExposeCSEOpportunities(const Value * Op0,
                                          const Value * Op1) {
  // Filter out pointer values as those cannot appear directly in subtract.
  // FIXME: we may want to go through inttoptrs or bitcasts.
  if (Op0->getType()->isPointerTy())
    return false;
  // Count every use of both Op0 and Op1 in a subtract.
  // Each time Op0 is the first operand, count -1: swapping is bad, the
  // subtract has already the same layout as the compare.
  // Each time Op0 is the second operand, count +1: swapping is good, the
  // subtract has a different layout than the compare.
  // At the end, if the benefit is greater than 0, Op0 should come second to
  // expose more CSE opportunities.
  int GlobalSwapBenefits = 0;
  for (const User *U : Op0->users()) {
    const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(U);
    if (!BinOp || BinOp->getOpcode() != Instruction::Sub)
      continue;
    // If Op0 is the first argument, this is not beneficial to swap the
    // arguments.
    int LocalSwapBenefits = -1;
    unsigned Op1Idx = 1;
    if (BinOp->getOperand(Op1Idx) == Op0) {
      Op1Idx = 0;
      LocalSwapBenefits = 1;
    }
    if (BinOp->getOperand(Op1Idx) != Op1)
      continue;
    GlobalSwapBenefits += LocalSwapBenefits;
  }
  return GlobalSwapBenefits > 0;
}

/// \brief Check that one use is in the same block as the definition and all
/// other uses are in blocks dominated by a given block
///
/// \param DI Definition
/// \param UI Use
/// \param DB Block that must dominate all uses of \p DI outside
///           the parent block
/// \return true when \p UI is the only use of \p DI in the parent block
/// and all other uses of \p DI are in blocks dominated by \p DB.
///
bool InstCombiner::dominatesAllUses(const Instruction *DI,
                                    const Instruction *UI,
                                    const BasicBlock *DB) const {
  assert(DI && UI && "Instruction not defined\n");
  // ignore incomplete definitions
  if (!DI->getParent())
    return false;
  // DI and UI must be in the same block
  if (DI->getParent() != UI->getParent())
    return false;
  // Protect from self-referencing blocks
  if (DI->getParent() == DB)
    return false;
  // DominatorTree available?
  if (!DT)
    return false;
  for (const User *U : DI->users()) {
    auto *Usr = cast<Instruction>(U);
    if (Usr != UI && !DT->dominates(DB, Usr->getParent()))
      return false;
  }
  return true;
}

/// Return true when the instruction sequence within a block is select-cmp-br.
static bool isChainSelectCmpBranch(const SelectInst *SI) {
  const BasicBlock *BB = SI->getParent();
  if (!BB)
    return false;
  auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator());
  if (!BI || BI->getNumSuccessors() != 2)
    return false;
  auto *IC = dyn_cast<ICmpInst>(BI->getCondition());
  if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI))
    return false;
  return true;
}

/// \brief True when a select result is replaced by one of its operands
/// in select-icmp sequence. This will eventually result in the elimination
/// of the select.
///
/// \param SI    Select instruction
/// \param Icmp  Compare instruction
/// \param SIOpd Operand that replaces the select
///
/// Notes:
/// - The replacement is global and requires dominator information
/// - The caller is responsible for the actual replacement
///
/// Example:
///
/// entry:
///  %4 = select i1 %3, %C* %0, %C* null
///  %5 = icmp eq %C* %4, null
///  br i1 %5, label %9, label %7
/// ...
/// ;