Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td (nonexistent) @@ -1,245 +0,0 @@ -//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains DAG node defintions for the AMDGPU target. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// AMDGPU DAG Profiles -//===----------------------------------------------------------------------===// - -def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ - SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> -]>; - -def AMDGPUTrigPreOp : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] ->; - -def AMDGPULdExpOp : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] ->; - -def AMDGPUFPClassOp : SDTypeProfile<1, 2, - [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] ->; - -def AMDGPUDivScaleOp : SDTypeProfile<2, 3, - [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] ->; - -// float, float, float, vcc -def AMDGPUFmasOp : SDTypeProfile<1, 4, - [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>] ->; - -//===----------------------------------------------------------------------===// -// AMDGPU DAG Nodes -// - -// This argument to this node is a dword address. -def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; - -def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; -def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; - -// out = a - floor(a) -def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; - -// out = 1.0 / a -def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; - -// out = 1.0 / sqrt(a) -def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; - -// out = 1.0 / sqrt(a) -def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; - -// out = 1.0 / sqrt(a) result clamped to +/- max_float. -def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; - -def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; - -def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; - -// out = max(a, b) a and b are floats, where a nan comparison fails. -// This is not commutative because this gives the second operand: -// x < nan ? x : nan -> nan -// nan < x ? nan : x -> x -def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, - [] ->; - -def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; - -// out = max(a, b) a and b are signed ints -def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; - -// out = max(a, b) a and b are unsigned ints -def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; - -// out = min(a, b) a and b are floats, where a nan comparison fails. 
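The non-commutative NaN behavior called out for FMAX_LEGACY above (and for FMIN_LEGACY just below) falls out of implementing the operation as a plain select: any comparison involving a NaN is false, so the select always yields the second operand. A minimal scalar model, for illustration only (not code from this tree):

    #include <cassert>
    #include <cmath>

    // Any NaN comparison fails, so the select falls through to operand 2.
    static float fmax_legacy(float a, float b) { return a > b ? a : b; }
    static float fmin_legacy(float a, float b) { return a < b ? a : b; }

    int main() {
      float nan = std::nanf("");
      assert(std::isnan(fmin_legacy(2.0f, nan))); // x < nan ? x : nan -> nan
      assert(fmin_legacy(nan, 2.0f) == 2.0f);     // nan < x ? nan : x -> x
    }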
-def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, - [] ->; - -// FIXME: TableGen doesn't like commutative instructions with more -// than 2 operands. -// out = max(a, b, c) a, b and c are floats -def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = max(a, b, c) a, b, and c are signed ints -def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = max(a, b, c) a, b and c are unsigned ints -def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b, c) a, b and c are floats -def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b, c) a, b and c are signed ints -def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b) a and b are unsigned ints -def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0 -def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; - -// out = (src1 > src0) ? 1 : 0 -def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; - - -def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", - SDTIntToFPOp, []>; -def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", - SDTIntToFPOp, []>; -def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2", - SDTIntToFPOp, []>; -def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", - SDTIntToFPOp, []>; - - -// urecip - This operation is a helper for integer division, it returns the -// result of 1 / a as a fractional unsigned integer. -// out = (2^32 / a) + e -// e is rounding error -def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; - -// Special case divide preop and flags. -def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; - -// Special case divide FMA with scale and flags (src0 = Quotient, -// src1 = Denominator, src2 = Numerator). -def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; - -// Single or double precision division fixup. -// Special case divide fixup and flags(src0 = Quotient, src1 = -// Denominator, src2 = Numerator). -def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; - -// Look Up 2.0 / pi src0 with segment select src1[4:0] -def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; - -def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", - SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, - [SDNPHasChain, SDNPMayLoad]>; - -def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", - SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, - [SDNPHasChain, SDNPMayStore]>; - -// MSKOR instructions are atomic memory instructions used mainly for storing -// 8-bit and 16-bit values. 
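The CARRY and BORROW nodes defined above produce only the overflow/underflow bit of a 32-bit add or subtract, exactly as their comments state. A standalone scalar model (illustrative, not from this tree):

    #include <cstdint>

    // out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0
    static uint32_t carry(uint32_t a, uint32_t b) {
      return (uint64_t(a) + b) > 0xFFFFFFFFu ? 1 : 0;
    }

    // out = (src1 > src0) ? 1 : 0
    static uint32_t borrow(uint32_t a, uint32_t b) {
      return b > a ? 1 : 0;
    }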
The definition is: -// -// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src) -// -// src0: vec4(src, 0, 0, mask) -// src1: dst - rat offset (aka pointer) in dwords -def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", - SDTypeProfile<0, 2, []>, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; - -def AMDGPUround : SDNode<"ISD::FROUND", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; - -def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; -def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; -def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; -def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; - -def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; - -// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when -// performing the mulitply. The result is a 32-bit value. -def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, - [SDNPCommutative] ->; -def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, - [SDNPCommutative] ->; - -def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, - [] ->; -def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, - [] ->; - -def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue]>; - -def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue, SDNPOutGlue]>; - -def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", - SDTypeProfile<1, 4, [SDTCisFP<0>]>, - [SDNPInGlue]>; - -//===----------------------------------------------------------------------===// -// Flow Control Profile Types -//===----------------------------------------------------------------------===// -// Branch instruction where second and third are basic blocks -def SDTIL_BRCond : SDTypeProfile<0, 2, [ - SDTCisVT<0, OtherVT> - ]>; - -//===----------------------------------------------------------------------===// -// Flow Control DAG Nodes -//===----------------------------------------------------------------------===// -def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; - -//===----------------------------------------------------------------------===// -// Call/Return DAG Nodes -//===----------------------------------------------------------------------===// -def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp (nonexistent) @@ -1,543 +0,0 @@ -//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief SI implementation of the TargetRegisterInfo class. 
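Backing up to the 24-bit multiply nodes (MUL_U24/MUL_I24) in the AMDGPUInstrInfo.td hunk above: the top 8 bits of each source are ignored and the product is a full 32-bit value. A scalar model of that reading (illustrative only, not code from this tree):

    #include <cstdint>

    // Unsigned: mask each source to 24 bits; keep the low 32 bits of the product.
    static uint32_t mul_u24(uint32_t a, uint32_t b) {
      return (a & 0xFFFFFFu) * (b & 0xFFFFFFu);
    }

    // Signed: sign-extend bit 23 of each source first, then truncate to 32 bits.
    static int32_t mul_i24(int32_t a, int32_t b) {
      auto sext24 = [](int32_t x) { return int32_t(uint32_t(x) << 8) >> 8; };
      return int32_t(int64_t(sext24(a)) * sext24(b));
    }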
-// -//===----------------------------------------------------------------------===// - - -#include "SIRegisterInfo.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} - -BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { - BitVector Reserved(getNumRegs()); - Reserved.set(AMDGPU::EXEC); - - // EXEC_LO and EXEC_HI could be allocated and used as regular register, - // but this seems likely to result in bugs, so I'm marking them as reserved. - Reserved.set(AMDGPU::EXEC_LO); - Reserved.set(AMDGPU::EXEC_HI); - - Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - Reserved.set(AMDGPU::FLAT_SCR); - Reserved.set(AMDGPU::FLAT_SCR_LO); - Reserved.set(AMDGPU::FLAT_SCR_HI); - - // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs - Reserved.set(AMDGPU::VGPR255); - Reserved.set(AMDGPU::VGPR254); - - // Tonga and Iceland can only allocate a fixed number of SGPRs due - // to a hw bug. - if (MF.getSubtarget().hasSGPRInitBug()) { - unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). - // Assume XNACK_MASK is unused. - unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; - - for (unsigned i = Limit; i < NumSGPRs; ++i) { - unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); - MCRegAliasIterator R = MCRegAliasIterator(Reg, this, true); - - for (; R.isValid(); ++R) - Reserved.set(*R); - } - } - - return Reserved; -} - -unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const { - - const AMDGPUSubtarget &STI = MF.getSubtarget(); - // FIXME: We should adjust the max number of waves based on LDS size. 
- unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), - STI.getMaxWavesPerCU()); - unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); - - for (regclass_iterator I = regclass_begin(), E = regclass_end(); - I != E; ++I) { - - unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); - unsigned Limit; - - if (isSGPRClass(*I)) { - Limit = SGPRLimit / NumSubRegs; - } else { - Limit = VGPRLimit / NumSubRegs; - } - - const int *Sets = getRegClassPressureSets(*I); - assert(Sets); - for (unsigned i = 0; Sets[i] != -1; ++i) { - if (Sets[i] == (int)Idx) - return Limit; - } - } - return 256; -} - -bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { - return Fn.getFrameInfo()->hasStackObjects(); -} - -static unsigned getNumSubRegsForSpillOp(unsigned Op) { - - switch (Op) { - case AMDGPU::SI_SPILL_S512_SAVE: - case AMDGPU::SI_SPILL_S512_RESTORE: - case AMDGPU::SI_SPILL_V512_SAVE: - case AMDGPU::SI_SPILL_V512_RESTORE: - return 16; - case AMDGPU::SI_SPILL_S256_SAVE: - case AMDGPU::SI_SPILL_S256_RESTORE: - case AMDGPU::SI_SPILL_V256_SAVE: - case AMDGPU::SI_SPILL_V256_RESTORE: - return 8; - case AMDGPU::SI_SPILL_S128_SAVE: - case AMDGPU::SI_SPILL_S128_RESTORE: - case AMDGPU::SI_SPILL_V128_SAVE: - case AMDGPU::SI_SPILL_V128_RESTORE: - return 4; - case AMDGPU::SI_SPILL_V96_SAVE: - case AMDGPU::SI_SPILL_V96_RESTORE: - return 3; - case AMDGPU::SI_SPILL_S64_SAVE: - case AMDGPU::SI_SPILL_S64_RESTORE: - case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V64_RESTORE: - return 2; - case AMDGPU::SI_SPILL_S32_SAVE: - case AMDGPU::SI_SPILL_S32_RESTORE: - case AMDGPU::SI_SPILL_V32_SAVE: - case AMDGPU::SI_SPILL_V32_RESTORE: - return 1; - default: llvm_unreachable("Invalid spill opcode"); - } -} - -void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, - unsigned Value, - unsigned ScratchRsrcReg, - unsigned ScratchOffset, - int64_t Offset, - RegScavenger *RS) const { - - MachineBasicBlock *MBB = MI->getParent(); - const MachineFunction *MF = MI->getParent()->getParent(); - const SIInstrInfo *TII = - static_cast(MF->getSubtarget().getInstrInfo()); - LLVMContext &Ctx = MF->getFunction()->getContext(); - DebugLoc DL = MI->getDebugLoc(); - bool IsLoad = TII->get(LoadStoreOp).mayLoad(); - - bool RanOutOfSGPRs = false; - unsigned SOffset = ScratchOffset; - - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - unsigned Size = NumSubRegs * 4; - - if (!isUInt<12>(Offset + Size)) { - SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); - if (SOffset == AMDGPU::NoRegister) { - RanOutOfSGPRs = true; - SOffset = AMDGPU::SGPR0; - } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) - .addReg(ScratchOffset) - .addImm(Offset); - Offset = 0; - } - - if (RanOutOfSGPRs) - Ctx.emitError("Ran out of SGPRs for spilling VGPRS"); - - for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { - unsigned SubReg = NumSubRegs > 1 ? 
- getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : - Value; - bool IsKill = (i == e - 1); - - BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) - .addReg(SOffset) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); - } -} - -void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { - MachineFunction *MF = MI->getParent()->getParent(); - MachineBasicBlock *MBB = MI->getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIInstrInfo *TII = - static_cast(MF->getSubtarget().getInstrInfo()); - DebugLoc DL = MI->getDebugLoc(); - - MachineOperand &FIOp = MI->getOperand(FIOperandNum); - int Index = MI->getOperand(FIOperandNum).getIndex(); - - switch (MI->getOpcode()) { - // SGPR register spill - case AMDGPU::SI_SPILL_S512_SAVE: - case AMDGPU::SI_SPILL_S256_SAVE: - case AMDGPU::SI_SPILL_S128_SAVE: - case AMDGPU::SI_SPILL_S64_SAVE: - case AMDGPU::SI_SPILL_S32_SAVE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), - &AMDGPU::SGPR_32RegClass, i); - struct SIMachineFunctionInfo::SpilledReg Spill = - MFI->getSpilledReg(MF, Index, i); - - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), - Spill.VGPR) - .addReg(SubReg) - .addImm(Spill.Lane); - - } - MI->eraseFromParent(); - break; - } - - // SGPR register restore - case AMDGPU::SI_SPILL_S512_RESTORE: - case AMDGPU::SI_SPILL_S256_RESTORE: - case AMDGPU::SI_SPILL_S128_RESTORE: - case AMDGPU::SI_SPILL_S64_RESTORE: - case AMDGPU::SI_SPILL_S32_RESTORE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), - &AMDGPU::SGPR_32RegClass, i); - struct SIMachineFunctionInfo::SpilledReg Spill = - MFI->getSpilledReg(MF, Index, i); - - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), - SubReg) - .addReg(Spill.VGPR) - .addImm(Spill.Lane) - .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); - } - - // TODO: only do this when it is needed - switch (MF->getSubtarget().getGeneration()) { - case AMDGPUSubtarget::SOUTHERN_ISLANDS: - // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI - TII->insertNOPs(MI, 3); - break; - case AMDGPUSubtarget::SEA_ISLANDS: - break; - default: // VOLCANIC_ISLANDS and later - // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI - // and later. This also applies to VALUs which write VCC, but we're - // unlikely to see VMEM use VCC. 
- TII->insertNOPs(MI, 4); - } - - MI->eraseFromParent(); - break; - } - - // VGPR register spill - case AMDGPU::SI_SPILL_V512_SAVE: - case AMDGPU::SI_SPILL_V256_SAVE: - case AMDGPU::SI_SPILL_V128_SAVE: - case AMDGPU::SI_SPILL_V96_SAVE: - case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V32_SAVE: - buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index), RS); - MI->eraseFromParent(); - break; - case AMDGPU::SI_SPILL_V32_RESTORE: - case AMDGPU::SI_SPILL_V64_RESTORE: - case AMDGPU::SI_SPILL_V96_RESTORE: - case AMDGPU::SI_SPILL_V128_RESTORE: - case AMDGPU::SI_SPILL_V256_RESTORE: - case AMDGPU::SI_SPILL_V512_RESTORE: { - buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index), RS); - MI->eraseFromParent(); - break; - } - - default: { - int64_t Offset = FrameInfo->getObjectOffset(Index); - FIOp.ChangeToImmediate(Offset); - if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { - unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj); - BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) - .addImm(Offset); - FIOp.ChangeToRegister(TmpReg, false, false, true); - } - } - } -} - -const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( - MVT VT) const { - switch(VT.SimpleTy) { - default: - case MVT::i32: return &AMDGPU::VGPR_32RegClass; - } -} - -unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { - return getEncodingValue(Reg) & 0xff; -} - -const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { - assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - - static const TargetRegisterClass *BaseClasses[] = { - &AMDGPU::VGPR_32RegClass, - &AMDGPU::SReg_32RegClass, - &AMDGPU::VReg_64RegClass, - &AMDGPU::SReg_64RegClass, - &AMDGPU::VReg_96RegClass, - &AMDGPU::VReg_128RegClass, - &AMDGPU::SReg_128RegClass, - &AMDGPU::VReg_256RegClass, - &AMDGPU::SReg_256RegClass, - &AMDGPU::VReg_512RegClass - }; - - for (const TargetRegisterClass *BaseClass : BaseClasses) { - if (BaseClass->contains(Reg)) { - return BaseClass; - } - } - return nullptr; -} - -bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_512RegClass, RC); -} - -const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( - const TargetRegisterClass *SRC) const { - if (hasVGPRs(SRC)) { - return SRC; - } else if (SRC == &AMDGPU::SCCRegRegClass) { - return &AMDGPU::VCCRegRegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) { - return &AMDGPU::VGPR_32RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) { - return &AMDGPU::VReg_64RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) { - return &AMDGPU::VReg_128RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) { - return 
&AMDGPU::VReg_256RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { - return &AMDGPU::VReg_512RegClass; - } - return nullptr; -} - -const TargetRegisterClass *SIRegisterInfo::getSubRegClass( - const TargetRegisterClass *RC, unsigned SubIdx) const { - if (SubIdx == AMDGPU::NoSubRegister) - return RC; - - // If this register has a sub-register, we can safely assume it is a 32-bit - // register, because all of SI's sub-registers are 32-bit. - if (isSGPRClass(RC)) { - return &AMDGPU::SGPR_32RegClass; - } else { - return &AMDGPU::VGPR_32RegClass; - } -} - -unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, - const TargetRegisterClass *SubRC, - unsigned Channel) const { - - switch (Reg) { - case AMDGPU::VCC: - switch(Channel) { - case 0: return AMDGPU::VCC_LO; - case 1: return AMDGPU::VCC_HI; - default: llvm_unreachable("Invalid SubIdx for VCC"); - } - - case AMDGPU::FLAT_SCR: - switch (Channel) { - case 0: - return AMDGPU::FLAT_SCR_LO; - case 1: - return AMDGPU::FLAT_SCR_HI; - default: - llvm_unreachable("Invalid SubIdx for FLAT_SCR"); - } - break; - - case AMDGPU::EXEC: - switch (Channel) { - case 0: - return AMDGPU::EXEC_LO; - case 1: - return AMDGPU::EXEC_HI; - default: - llvm_unreachable("Invalid SubIdx for EXEC"); - } - break; - } - - const TargetRegisterClass *RC = getPhysRegClass(Reg); - // 32-bit registers don't have sub-registers, so we can just return the - // Reg. We need to have this check here, because the calculation below - // using getHWRegIndex() will fail with special 32-bit registers like - // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0. - if (RC->getSize() == 4) { - assert(Channel == 0); - return Reg; - } - - unsigned Index = getHWRegIndex(Reg); - return SubRC->getRegister(Index + Channel); -} - -bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { - return OpType == AMDGPU::OPERAND_REG_IMM32; -} - -bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { - if (opCanUseLiteralConstant(OpType)) - return true; - - return OpType == AMDGPU::OPERAND_REG_INLINE_C; -} - -unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const { - - const SIMachineFunctionInfo *MFI = MF.getInfo(); - switch (Value) { - case SIRegisterInfo::TGID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); - case SIRegisterInfo::TGID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); - case SIRegisterInfo::TGID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); - case SIRegisterInfo::SCRATCH_WAVE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); - case SIRegisterInfo::SCRATCH_PTR: - return AMDGPU::SGPR2_SGPR3; - case SIRegisterInfo::INPUT_PTR: - return AMDGPU::SGPR0_SGPR1; - case SIRegisterInfo::TIDIG_X: - return AMDGPU::VGPR0; - case SIRegisterInfo::TIDIG_Y: - return AMDGPU::VGPR1; - case SIRegisterInfo::TIDIG_Z: - return AMDGPU::VGPR2; - } - llvm_unreachable("unexpected preloaded value type"); -} - -/// \brief Returns a register that is not used at any point in the function. -/// If all registers are used, then this function will return -// AMDGPU::NoRegister. 
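The getPreloadedValue() switch above pins down where a kernel finds its dispatch inputs: the three workgroup IDs occupy the SGPRs immediately after the user SGPRs, the input and scratch pointers live in fixed SGPR pairs, and the thread IDs always arrive in the first three VGPRs. A small worked illustration; the user-SGPR count used here is an assumption for the example, not taken from this tree:

    // Register *indices*, mirroring the switch above (not RegClass enums).
    static unsigned tgidSgprIndex(unsigned NumUserSGPRs, unsigned Dim) {
      return NumUserSGPRs + Dim; // Dim: 0 = X, 1 = Y, 2 = Z
    }

    // e.g. with a hypothetical 4 user SGPRs (enough to cover INPUT_PTR in
    // SGPR0_SGPR1 and SCRATCH_PTR in SGPR2_SGPR3 per the switch above),
    // TGID_X/Y/Z land in SGPR4/5/6 and the compute scratch wave offset in
    // SGPR8, while TIDIG_X/Y/Z are always VGPR0/1/2.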
-unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const { - - for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); - I != E; ++I) { - if (!MRI.isPhysRegUsed(*I)) - return *I; - } - return AMDGPU::NoRegister; -} - -unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { - switch(WaveCount) { - case 10: return 24; - case 9: return 28; - case 8: return 32; - case 7: return 36; - case 6: return 40; - case 5: return 48; - case 4: return 64; - case 3: return 84; - case 2: return 128; - default: return 256; - } -} - -unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, - unsigned WaveCount) const { - if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - switch (WaveCount) { - case 10: return 80; - case 9: return 80; - case 8: return 96; - default: return 102; - } - } else { - switch(WaveCount) { - case 10: return 48; - case 9: return 56; - case 8: return 64; - case 7: return 72; - case 6: return 80; - case 5: return 96; - default: return 103; - } - } -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp (nonexistent) @@ -1,480 +0,0 @@ -//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Insert wait instructions for memory reads and writes. -/// -/// Memory reads and writes are issued asynchronously, so we need to insert -/// S_WAITCNT instructions when we want to access any of their results or -/// overwrite any register that's used asynchronously. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -namespace { - -/// \brief One variable for each of the hardware counters -typedef union { - struct { - unsigned VM; - unsigned EXP; - unsigned LGKM; - } Named; - unsigned Array[3]; - -} Counters; - -typedef enum { - OTHER, - SMEM, - VMEM -} InstType; - -typedef Counters RegCounters[512]; -typedef std::pair RegInterval; - -class SIInsertWaits : public MachineFunctionPass { - -private: - static char ID; - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - - /// \brief Constant hardware limits - static const Counters WaitCounts; - - /// \brief Constant zero value - static const Counters ZeroCounts; - - /// \brief Counter values we have already waited on. - Counters WaitedOn; - - /// \brief Counter values for last instruction issued. - Counters LastIssued; - - /// \brief Registers used by async instructions. - RegCounters UsedRegs; - - /// \brief Registers defined by async instructions. - RegCounters DefinedRegs; - - /// \brief Different export instruction types seen since last wait. 
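The getNumVGPRsAllowed()/getNumSGPRsAllowed() tables at the end of the SIRegisterInfo.cpp hunk above are per-wave register budgets; read in reverse, they give the usual occupancy estimate for a kernel's register usage. A hypothetical helper (not in this tree) that inverts the VGPR table:

    // Largest wave count whose VGPR budget still fits the kernel's usage.
    static unsigned wavesForVGPRUsage(unsigned NumVGPRs) {
      static const struct { unsigned Waves, Budget; } Table[] = {
        {10, 24}, {9, 28}, {8, 32}, {7, 36}, {6, 40},
        {5, 48},  {4, 64}, {3, 84}, {2, 128}, {1, 256},
      };
      for (const auto &E : Table)
        if (NumVGPRs <= E.Budget)
          return E.Waves;
      return 0; // exceeds the 256-register VGPR file
    }

getRegPressureSetLimit() in the same file applies these budgets directly, divided by the register-class width in dwords.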
- unsigned ExpInstrTypesSeen; - - /// \brief Type of the last opcode. - InstType LastOpcodeType; - - bool LastInstWritesM0; - - /// \brief Get increment/decrement amount for this instruction. - Counters getHwCounts(MachineInstr &MI); - - /// \brief Is operand relevant for async execution? - bool isOpRelevant(MachineOperand &Op); - - /// \brief Get register interval an operand affects. - RegInterval getRegInterval(MachineOperand &Op); - - /// \brief Handle instructions async components - void pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I); - - /// \brief Insert the actual wait instruction - bool insertWait(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Counts); - - /// \brief Do we need def2def checks? - bool unorderedDefines(MachineInstr &MI); - - /// \brief Resolve all operand dependencies to counter requirements - Counters handleOperands(MachineInstr &MI); - - /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. - void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); - -public: - SIInsertWaits(TargetMachine &tm) : - MachineFunctionPass(ID), - TII(nullptr), - TRI(nullptr), - ExpInstrTypesSeen(0) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI insert wait instructions"; - } - -}; - -} // End anonymous namespace - -char SIInsertWaits::ID = 0; - -const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; -const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; - -FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { - return new SIInsertWaits(tm); -} - -Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { - - uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; - Counters Result; - - Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); - - // Only consider stores or EXP for EXP_CNT - Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && - (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); - - // LGKM may uses larger values - if (TSFlags & SIInstrFlags::LGKM_CNT) { - - if (TII->isSMRD(MI.getOpcode())) { - - MachineOperand &Op = MI.getOperand(0); - assert(Op.isReg() && "First LGKM operand must be a register!"); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; - - } else { - // DS - Result.Named.LGKM = 1; - } - - } else { - Result.Named.LGKM = 0; - } - - return Result; -} - -bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { - - // Constants are always irrelevant - if (!Op.isReg()) - return false; - - // Defines are always relevant - if (Op.isDef()) - return true; - - // For exports all registers are relevant - MachineInstr &MI = *Op.getParent(); - if (MI.getOpcode() == AMDGPU::EXP) - return true; - - // For stores the stored value is also relevant - if (!MI.getDesc().mayStore()) - return false; - - // Check if this operand is the value being stored. - // Special case for DS instructions, since the address - // operand comes before the value operand and it may have - // multiple data operands. 
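The Counters union near the top of this file is what lets the pass mix indexed and semantic access: Named.VM/EXP/LGKM alias Array[0..2], so the bookkeeping loops run over Array while the logic reads Named. A short self-contained illustration of that aliasing (same shape as the typedef above, type-punning through the union exactly as the pass itself does):

    #include <cassert>

    union Counters {
      struct { unsigned VM, EXP, LGKM; } Named;
      unsigned Array[3];
    };

    int main() {
      Counters C = {{0, 0, 0}};
      C.Named.EXP = 2;            // same storage as C.Array[1]
      for (unsigned i = 0; i < 3; ++i)
        C.Array[i] += 1;          // advance all three counters at once
      assert(C.Named.VM == 1 && C.Named.EXP == 3 && C.Named.LGKM == 1);
    }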
- - if (TII->isDS(MI.getOpcode())) { - MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); - if (Data && Op.isIdenticalTo(*Data)) - return true; - - MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); - if (Data0 && Op.isIdenticalTo(*Data0)) - return true; - - MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); - if (Data1 && Op.isIdenticalTo(*Data1)) - return true; - - return false; - } - - // NOTE: This assumes that the value operand is before the - // address operand, and that there is only one value operand. - for (MachineInstr::mop_iterator I = MI.operands_begin(), - E = MI.operands_end(); I != E; ++I) { - - if (I->isReg() && I->isUse()) - return Op.isIdenticalTo(*I); - } - - return false; -} - -RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { - - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - return std::make_pair(0, 0); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - - assert(Size >= 4); - - RegInterval Result; - Result.first = TRI->getEncodingValue(Reg); - Result.second = Result.first + Size / 4; - - return Result; -} - -void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - - // Get the hardware counter increments and sum them up - Counters Increment = getHwCounts(*I); - unsigned Sum = 0; - - for (unsigned i = 0; i < 3; ++i) { - LastIssued.Array[i] += Increment.Array[i]; - Sum += Increment.Array[i]; - } - - // If we don't increase anything then that's it - if (Sum == 0) { - LastOpcodeType = OTHER; - return; - } - - if (MBB.getParent()->getSubtarget().getGeneration() >= - AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM - // or SMEM clause, respectively. - // - // The temporary workaround is to break the clauses with S_NOP. - // - // The proper solution would be to allocate registers such that all source - // and destination registers don't overlap, e.g. this is illegal: - // r0 = load r2 - // r2 = load r0 - if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || - (LastOpcodeType == VMEM && Increment.Named.VM)) { - // Insert a NOP to break the clause. - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) - .addImm(0); - LastInstWritesM0 = false; - } - - if (TII->isSMRD(I->getOpcode())) - LastOpcodeType = SMEM; - else if (Increment.Named.VM) - LastOpcodeType = VMEM; - } - - // Remember which export instructions we have seen - if (Increment.Named.EXP) { - ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2; - } - - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - - MachineOperand &Op = I->getOperand(i); - if (!isOpRelevant(Op)) - continue; - - RegInterval Interval = getRegInterval(Op); - for (unsigned j = Interval.first; j < Interval.second; ++j) { - - // Remember which registers we define - if (Op.isDef()) - DefinedRegs[j] = LastIssued; - - // and which one we are using - if (Op.isUse()) - UsedRegs[j] = LastIssued; - } - } -} - -bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Required) { - - // End of program? 
No need to wait on anything - if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) - return false; - - // Figure out if the async instructions execute in order - bool Ordered[3]; - - // VM_CNT is always ordered - Ordered[0] = true; - - // EXP_CNT is unordered if we have both EXP & VM-writes - Ordered[1] = ExpInstrTypesSeen == 3; - - // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS - Ordered[2] = false; - - // The values we are going to put into the S_WAITCNT instruction - Counters Counts = WaitCounts; - - // Do we really need to wait? - bool NeedWait = false; - - for (unsigned i = 0; i < 3; ++i) { - - if (Required.Array[i] <= WaitedOn.Array[i]) - continue; - - NeedWait = true; - - if (Ordered[i]) { - unsigned Value = LastIssued.Array[i] - Required.Array[i]; - - // Adjust the value to the real hardware possibilities. - Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); - - } else - Counts.Array[i] = 0; - - // Remember on what we have waited on. - WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; - } - - if (!NeedWait) - return false; - - // Reset EXP_CNT instruction types - if (Counts.Named.EXP == 0) - ExpInstrTypesSeen = 0; - - // Build the wait instruction - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm((Counts.Named.VM & 0xF) | - ((Counts.Named.EXP & 0x7) << 4) | - ((Counts.Named.LGKM & 0x7) << 8)); - - LastOpcodeType = OTHER; - LastInstWritesM0 = false; - return true; -} - -/// \brief helper function for handleOperands -static void increaseCounters(Counters &Dst, const Counters &Src) { - - for (unsigned i = 0; i < 3; ++i) - Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); -} - -Counters SIInsertWaits::handleOperands(MachineInstr &MI) { - - Counters Result = ZeroCounts; - - // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, - // but we also want to wait for any other outstanding transfers before - // signalling other hardware blocks - if (MI.getOpcode() == AMDGPU::S_SENDMSG) - return LastIssued; - - // For each register affected by this - // instruction increase the result sequence - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - - MachineOperand &Op = MI.getOperand(i); - RegInterval Interval = getRegInterval(Op); - for (unsigned j = Interval.first; j < Interval.second; ++j) { - - if (Op.isDef()) { - increaseCounters(Result, UsedRegs[j]); - increaseCounters(Result, DefinedRegs[j]); - } - - if (Op.isUse()) - increaseCounters(Result, DefinedRegs[j]); - } - } - - return Result; -} - -void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - if (MBB.getParent()->getSubtarget().getGeneration() < - AMDGPUSubtarget::VOLCANIC_ISLANDS) - return; - - // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. - if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); - LastInstWritesM0 = false; - return; - } - - // Set whether this instruction sets M0 - LastInstWritesM0 = false; - - unsigned NumOperands = I->getNumOperands(); - for (unsigned i = 0; i < NumOperands; i++) { - const MachineOperand &Op = I->getOperand(i); - - if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) - LastInstWritesM0 = true; - } -} - -// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" -// around other non-memory instructions. 
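The immediate that insertWait() feeds to S_WAITCNT above packs all three counters into one operand. A standalone model of that encoding, with field widths exactly as in the BuildMI call:

    #include <cassert>

    // simm16 layout: bits [3:0] = VM_CNT, [6:4] = EXP_CNT, [10:8] = LGKM_CNT.
    static unsigned encodeWaitcnt(unsigned VM, unsigned EXP, unsigned LGKM) {
      return (VM & 0xF) | ((EXP & 0x7) << 4) | ((LGKM & 0x7) << 8);
    }

    int main() {
      // The hardware maxima {15, 7, 7} form the all-ones pattern, i.e. the
      // "wait for nothing" value that WaitCounts represents.
      assert(encodeWaitcnt(15, 7, 7) == 0x77F);
      // Wait for all VM and LGKM traffic while leaving EXP unconstrained:
      assert(encodeWaitcnt(0, 7, 0) == 0x70);
    }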
-bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { - bool Changes = false; - - TII = static_cast(MF.getSubtarget().getInstrInfo()); - TRI = - static_cast(MF.getSubtarget().getRegisterInfo()); - - MRI = &MF.getRegInfo(); - - WaitedOn = ZeroCounts; - LastIssued = ZeroCounts; - LastOpcodeType = OTHER; - LastInstWritesM0 = false; - - memset(&UsedRegs, 0, sizeof(UsedRegs)); - memset(&DefinedRegs, 0, sizeof(DefinedRegs)); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - - // Wait for everything before a barrier. - if (I->getOpcode() == AMDGPU::S_BARRIER) - Changes |= insertWait(MBB, I, LastIssued); - else - Changes |= insertWait(MBB, I, handleOperands(*I)); - - pushInstruction(MBB, I); - handleSendMsg(MBB, I); - } - - // Wait for everything at the end of the MBB - Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); - } - - return Changes; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h (nonexistent) @@ -1,64 +0,0 @@ -//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief TargetRegisterInfo interface that is implemented by all hw codegen -/// targets. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H - -#include "llvm/ADT/BitVector.h" -#include "llvm/Target/TargetRegisterInfo.h" - -#define GET_REGINFO_HEADER -#define GET_REGINFO_ENUM -#include "AMDGPUGenRegisterInfo.inc" - -namespace llvm { - -class AMDGPUSubtarget; -class TargetInstrInfo; - -struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { - static const MCPhysReg CalleeSavedReg; - - AMDGPURegisterInfo(); - - BitVector getReservedRegs(const MachineFunction &MF) const override { - assert(!"Unimplemented"); return BitVector(); - } - - virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { - assert(!"Unimplemented"); return nullptr; - } - - virtual unsigned getHWRegIndex(unsigned Reg) const { - assert(!"Unimplemented"); return 0; - } - - /// \returns the sub reg enum value for the given \p Channel - /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) - unsigned getSubRegFromChannel(unsigned Channel) const; - - const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; - - unsigned getIndirectSubReg(unsigned IndirectIndex) const; - -}; - -} // End namespace llvm - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstructions.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstructions.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstructions.td (nonexistent) @@ -1,3435 +0,0 @@ -//===-- SIInstructions.td - SI Instruction Defintions ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// This file was originally auto-generated from a GPU register header file and -// all the instruction definitions were originally commented out. Instructions -// that are not yet supported remain commented out. -//===----------------------------------------------------------------------===// - -class InterpSlots { -int P0 = 2; -int P10 = 0; -int P20 = 1; -} -def INTERP : InterpSlots; - -def InterpSlot : Operand { - let PrintMethod = "printInterpSlot"; -} - -def SendMsgImm : Operand { - let PrintMethod = "printSendMsg"; -} - -def isGCN : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, - AssemblerPredicate<"FeatureGCN">; -def isSI : Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">; - -def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">; - -def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; -def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; - -def SWaitMatchClass : AsmOperandClass { - let Name = "SWaitCnt"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseSWaitCntOps"; -} - -def WAIT_FLAG : InstFlag<"printWaitFlag"> { - let ParserMatchClass = SWaitMatchClass; -} - -let SubtargetPredicate = isGCN in { - -//===----------------------------------------------------------------------===// -// EXP Instructions -//===----------------------------------------------------------------------===// - -defm EXP : EXP_m; - -//===----------------------------------------------------------------------===// -// SMRD Instructions -//===----------------------------------------------------------------------===// - -let mayLoad = 1 in { - -// We are using the SGPR_32 and not the SReg_32 register class for 32-bit -// SMRD instructions, because the SGPR_32 register class does not include M0 -// and writing to M0 from an SMRD instruction will hang the GPU. 
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; - -defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 ->; - -defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 ->; - -defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 ->; - -defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 ->; - -defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 ->; - -} // mayLoad = 1 - -//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; - -//===----------------------------------------------------------------------===// -// SOP1 Instructions -//===----------------------------------------------------------------------===// - -let isMoveImm = 1 in { - let isReMaterializable = 1, isAsCheapAsAMove = 1 in { - defm S_MOV_B32 : SOP1_32 , "s_mov_b32", []>; - defm S_MOV_B64 : SOP1_64 , "s_mov_b64", []>; - } // let isRematerializeable = 1 - - let Uses = [SCC] in { - defm S_CMOV_B32 : SOP1_32 , "s_cmov_b32", []>; - defm S_CMOV_B64 : SOP1_64 , "s_cmov_b64", []>; - } // End Uses = [SCC] -} // End isMoveImm = 1 - -let Defs = [SCC] in { - defm S_NOT_B32 : SOP1_32 , "s_not_b32", - [(set i32:$dst, (not i32:$src0))] - >; - - defm S_NOT_B64 : SOP1_64 , "s_not_b64", - [(set i64:$dst, (not i64:$src0))] - >; - defm S_WQM_B32 : SOP1_32 , "s_wqm_b32", []>; - defm S_WQM_B64 : SOP1_64 , "s_wqm_b64", []>; -} // End Defs = [SCC] - - -defm S_BREV_B32 : SOP1_32 , "s_brev_b32", - [(set i32:$dst, (AMDGPUbrev i32:$src0))] ->; -defm S_BREV_B64 : SOP1_64 , "s_brev_b64", []>; - -let Defs = [SCC] in { - defm S_BCNT0_I32_B32 : SOP1_32 , "s_bcnt0_i32_b32", []>; - defm S_BCNT0_I32_B64 : SOP1_32_64 , "s_bcnt0_i32_b64", []>; - defm S_BCNT1_I32_B32 : SOP1_32 , "s_bcnt1_i32_b32", - [(set i32:$dst, (ctpop i32:$src0))] - >; - defm S_BCNT1_I32_B64 : SOP1_32_64 , "s_bcnt1_i32_b64", []>; -} // End Defs = [SCC] - -defm S_FF0_I32_B32 : SOP1_32 , "s_ff0_i32_b32", []>; -defm S_FF0_I32_B64 : SOP1_32_64 , "s_ff0_i32_b64", []>; -defm S_FF1_I32_B32 : SOP1_32 , "s_ff1_i32_b32", - [(set i32:$dst, (cttz_zero_undef i32:$src0))] ->; -defm S_FF1_I32_B64 : SOP1_32_64 , "s_ff1_i32_b64", []>; - -defm S_FLBIT_I32_B32 : SOP1_32 , "s_flbit_i32_b32", - [(set i32:$dst, (ctlz_zero_undef i32:$src0))] ->; - -defm S_FLBIT_I32_B64 : SOP1_32_64 , "s_flbit_i32_b64", []>; -defm S_FLBIT_I32 : SOP1_32 , "s_flbit_i32", - [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))] ->; -defm S_FLBIT_I32_I64 : SOP1_32_64 , "s_flbit_i32_i64", []>; -defm S_SEXT_I32_I8 : SOP1_32 , "s_sext_i32_i8", - [(set i32:$dst, (sext_inreg i32:$src0, i8))] ->; -defm S_SEXT_I32_I16 : SOP1_32 , "s_sext_i32_i16", - [(set i32:$dst, (sext_inreg i32:$src0, i16))] ->; - -defm S_BITSET0_B32 : SOP1_32 , "s_bitset0_b32", []>; -defm S_BITSET0_B64 : SOP1_64 , "s_bitset0_b64", []>; -defm S_BITSET1_B32 : SOP1_32 , "s_bitset1_b32", []>; -defm S_BITSET1_B64 : SOP1_64 , "s_bitset1_b64", []>; -defm S_GETPC_B64 : SOP1_64_0 , "s_getpc_b64", []>; -defm S_SETPC_B64 : SOP1_64 , "s_setpc_b64", []>; -defm S_SWAPPC_B64 : SOP1_64 , 
"s_swappc_b64", []>; -defm S_RFE_B64 : SOP1_64 , "s_rfe_b64", []>; - -let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { - -defm S_AND_SAVEEXEC_B64 : SOP1_64 , "s_and_saveexec_b64", []>; -defm S_OR_SAVEEXEC_B64 : SOP1_64 , "s_or_saveexec_b64", []>; -defm S_XOR_SAVEEXEC_B64 : SOP1_64 , "s_xor_saveexec_b64", []>; -defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 , "s_andn2_saveexec_b64", []>; -defm S_ORN2_SAVEEXEC_B64 : SOP1_64 , "s_orn2_saveexec_b64", []>; -defm S_NAND_SAVEEXEC_B64 : SOP1_64 , "s_nand_saveexec_b64", []>; -defm S_NOR_SAVEEXEC_B64 : SOP1_64 , "s_nor_saveexec_b64", []>; -defm S_XNOR_SAVEEXEC_B64 : SOP1_64 , "s_xnor_saveexec_b64", []>; - -} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] - -defm S_QUADMASK_B32 : SOP1_32 , "s_quadmask_b32", []>; -defm S_QUADMASK_B64 : SOP1_64 , "s_quadmask_b64", []>; -defm S_MOVRELS_B32 : SOP1_32 , "s_movrels_b32", []>; -defm S_MOVRELS_B64 : SOP1_64 , "s_movrels_b64", []>; -defm S_MOVRELD_B32 : SOP1_32 , "s_movreld_b32", []>; -defm S_MOVRELD_B64 : SOP1_64 , "s_movreld_b64", []>; -defm S_CBRANCH_JOIN : SOP1_1 , "s_cbranch_join", []>; -defm S_MOV_REGRD_B32 : SOP1_32 , "s_mov_regrd_b32", []>; -let Defs = [SCC] in { - defm S_ABS_I32 : SOP1_32 , "s_abs_i32", []>; -} // End Defs = [SCC] -defm S_MOV_FED_B32 : SOP1_32 , "s_mov_fed_b32", []>; - -//===----------------------------------------------------------------------===// -// SOP2 Instructions -//===----------------------------------------------------------------------===// - -let Defs = [SCC] in { // Carry out goes to SCC -let isCommutable = 1 in { -defm S_ADD_U32 : SOP2_32 , "s_add_u32", []>; -defm S_ADD_I32 : SOP2_32 , "s_add_i32", - [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] ->; -} // End isCommutable = 1 - -defm S_SUB_U32 : SOP2_32 , "s_sub_u32", []>; -defm S_SUB_I32 : SOP2_32 , "s_sub_i32", - [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] ->; - -let Uses = [SCC] in { // Carry in comes from SCC -let isCommutable = 1 in { -defm S_ADDC_U32 : SOP2_32 , "s_addc_u32", - [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End isCommutable = 1 - -defm S_SUBB_U32 : SOP2_32 , "s_subb_u32", - [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End Uses = [SCC] - -defm S_MIN_I32 : SOP2_32 , "s_min_i32", - [(set i32:$dst, (smin i32:$src0, i32:$src1))] ->; -defm S_MIN_U32 : SOP2_32 , "s_min_u32", - [(set i32:$dst, (umin i32:$src0, i32:$src1))] ->; -defm S_MAX_I32 : SOP2_32 , "s_max_i32", - [(set i32:$dst, (smax i32:$src0, i32:$src1))] ->; -defm S_MAX_U32 : SOP2_32 , "s_max_u32", - [(set i32:$dst, (umax i32:$src0, i32:$src1))] ->; -} // End Defs = [SCC] - - -let Uses = [SCC] in { - defm S_CSELECT_B32 : SOP2_32 , "s_cselect_b32", []>; - defm S_CSELECT_B64 : SOP2_64 , "s_cselect_b64", []>; -} // End Uses = [SCC] - -let Defs = [SCC] in { -defm S_AND_B32 : SOP2_32 , "s_and_b32", - [(set i32:$dst, (and i32:$src0, i32:$src1))] ->; - -defm S_AND_B64 : SOP2_64 , "s_and_b64", - [(set i64:$dst, (and i64:$src0, i64:$src1))] ->; - -defm S_OR_B32 : SOP2_32 , "s_or_b32", - [(set i32:$dst, (or i32:$src0, i32:$src1))] ->; - -defm S_OR_B64 : SOP2_64 , "s_or_b64", - [(set i64:$dst, (or i64:$src0, i64:$src1))] ->; - -defm S_XOR_B32 : SOP2_32 , "s_xor_b32", - [(set i32:$dst, (xor i32:$src0, i32:$src1))] ->; - -defm S_XOR_B64 : SOP2_64 , "s_xor_b64", - [(set i64:$dst, (xor i64:$src0, i64:$src1))] ->; -defm S_ANDN2_B32 : SOP2_32 , "s_andn2_b32", []>; -defm S_ANDN2_B64 : SOP2_64 , "s_andn2_b64", []>; -defm S_ORN2_B32 : SOP2_32 , "s_orn2_b32", []>; -defm 
S_ORN2_B64 : SOP2_64 , "s_orn2_b64", []>; -defm S_NAND_B32 : SOP2_32 , "s_nand_b32", []>; -defm S_NAND_B64 : SOP2_64 , "s_nand_b64", []>; -defm S_NOR_B32 : SOP2_32 , "s_nor_b32", []>; -defm S_NOR_B64 : SOP2_64 , "s_nor_b64", []>; -defm S_XNOR_B32 : SOP2_32 , "s_xnor_b32", []>; -defm S_XNOR_B64 : SOP2_64 , "s_xnor_b64", []>; -} // End Defs = [SCC] - -// Use added complexity so these patterns are preferred to the VALU patterns. -let AddedComplexity = 1 in { -let Defs = [SCC] in { - -defm S_LSHL_B32 : SOP2_32 , "s_lshl_b32", - [(set i32:$dst, (shl i32:$src0, i32:$src1))] ->; -defm S_LSHL_B64 : SOP2_64_32 , "s_lshl_b64", - [(set i64:$dst, (shl i64:$src0, i32:$src1))] ->; -defm S_LSHR_B32 : SOP2_32 , "s_lshr_b32", - [(set i32:$dst, (srl i32:$src0, i32:$src1))] ->; -defm S_LSHR_B64 : SOP2_64_32 , "s_lshr_b64", - [(set i64:$dst, (srl i64:$src0, i32:$src1))] ->; -defm S_ASHR_I32 : SOP2_32 , "s_ashr_i32", - [(set i32:$dst, (sra i32:$src0, i32:$src1))] ->; -defm S_ASHR_I64 : SOP2_64_32 , "s_ashr_i64", - [(set i64:$dst, (sra i64:$src0, i32:$src1))] ->; -} // End Defs = [SCC] - -defm S_BFM_B32 : SOP2_32 , "s_bfm_b32", - [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; -defm S_BFM_B64 : SOP2_64 , "s_bfm_b64", []>; -defm S_MUL_I32 : SOP2_32 , "s_mul_i32", - [(set i32:$dst, (mul i32:$src0, i32:$src1))] ->; - -} // End AddedComplexity = 1 - -let Defs = [SCC] in { -defm S_BFE_U32 : SOP2_32 , "s_bfe_u32", []>; -defm S_BFE_I32 : SOP2_32 , "s_bfe_i32", []>; -defm S_BFE_U64 : SOP2_64 , "s_bfe_u64", []>; -defm S_BFE_I64 : SOP2_64_32 , "s_bfe_i64", []>; -} // End Defs = [SCC] - -let sdst = 0 in { -defm S_CBRANCH_G_FORK : SOP2_m < - sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs), - (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", [] ->; -} - -let Defs = [SCC] in { -defm S_ABSDIFF_I32 : SOP2_32 , "s_absdiff_i32", []>; -} // End Defs = [SCC] - -//===----------------------------------------------------------------------===// -// SOPC Instructions -//===----------------------------------------------------------------------===// - -def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; -def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; -def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; -def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">; -def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; -def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; -def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; -def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; -def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; -def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; -def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; -def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; -////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; -////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; -////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; -////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>; -//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>; - -//===----------------------------------------------------------------------===// -// SOPK Instructions -//===----------------------------------------------------------------------===// - -let isReMaterializable = 1 in { -defm S_MOVK_I32 : SOPK_32 , "s_movk_i32", []>; -} // End isReMaterializable = 1 -let Uses = [SCC] in { - defm S_CMOVK_I32 : SOPK_32 , "s_cmovk_i32", []>; -} - -let isCompare = 1 in { - -/* -This instruction is disabled for 
now until we can figure out how to teach -the instruction selector to correctly use the S_CMP* vs V_CMP* -instructions. - -When this instruction is enabled the code generator sometimes produces this -invalid sequence: - -SCC = S_CMPK_EQ_I32 SGPR0, imm -VCC = COPY SCC -VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 - -defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", - [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] ->; -*/ - -defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", []>; -defm S_CMPK_LG_I32 : SOPK_SCC , "s_cmpk_lg_i32", []>; -defm S_CMPK_GT_I32 : SOPK_SCC , "s_cmpk_gt_i32", []>; -defm S_CMPK_GE_I32 : SOPK_SCC , "s_cmpk_ge_i32", []>; -defm S_CMPK_LT_I32 : SOPK_SCC , "s_cmpk_lt_i32", []>; -defm S_CMPK_LE_I32 : SOPK_SCC , "s_cmpk_le_i32", []>; -defm S_CMPK_EQ_U32 : SOPK_SCC , "s_cmpk_eq_u32", []>; -defm S_CMPK_LG_U32 : SOPK_SCC , "s_cmpk_lg_u32", []>; -defm S_CMPK_GT_U32 : SOPK_SCC , "s_cmpk_gt_u32", []>; -defm S_CMPK_GE_U32 : SOPK_SCC , "s_cmpk_ge_u32", []>; -defm S_CMPK_LT_U32 : SOPK_SCC , "s_cmpk_lt_u32", []>; -defm S_CMPK_LE_U32 : SOPK_SCC , "s_cmpk_le_u32", []>; -} // End isCompare = 1 - -let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", - Constraints = "$sdst = $src0" in { - defm S_ADDK_I32 : SOPK_32TIE , "s_addk_i32", []>; - defm S_MULK_I32 : SOPK_32TIE , "s_mulk_i32", []>; -} - -defm S_CBRANCH_I_FORK : SOPK_m < - sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs), - (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16" ->; -defm S_GETREG_B32 : SOPK_32 , "s_getreg_b32", []>; -defm S_SETREG_B32 : SOPK_m < - sopk<0x13, 0x12>, "s_setreg_b32", (outs), - (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16" ->; -// FIXME: Not on SI? -//defm S_GETREG_REGRD_B32 : SOPK_32 , "s_getreg_regrd_b32", []>; -defm S_SETREG_IMM32_B32 : SOPK_IMM32 < - sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs), - (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16" ->; - -//===----------------------------------------------------------------------===// -// SOPP Instructions -//===----------------------------------------------------------------------===// - -def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; - -let isTerminator = 1 in { - -def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", - [(IL_retflag)]> { - let simm16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; -} - -let isBranch = 1 in { -def S_BRANCH : SOPP < - 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", - [(br bb:$simm16)]> { - let isBarrier = 1; -} - -let DisableEncoding = "$scc" in { -def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), - "s_cbranch_scc0 $simm16" ->; -def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), - "s_cbranch_scc1 $simm16" ->; -} // End DisableEncoding = "$scc" - -def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), - "s_cbranch_vccz $simm16" ->; -def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), - "s_cbranch_vccnz $simm16" ->; - -let DisableEncoding = "$exec" in { -def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), - "s_cbranch_execz $simm16" ->; -def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), - "s_cbranch_execnz $simm16" ->; -} // End DisableEncoding = "$exec" - - -} // End isBranch = 1 -} // End isTerminator = 1 - -let hasSideEffects = 1 in { -def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", - [(int_AMDGPU_barrier_local)] -> { - let simm16 = 0; - let isBarrier = 1; - let 
hasCtrlDep = 1;
- let mayLoad = 1;
- let mayStore = 1;
-}
-
-def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
-def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
-def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">;
-def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
-
-let Uses = [EXEC, M0] in {
- def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
- [(AMDGPUsendmsg (i32 imm:$simm16))]
- >;
-} // End Uses = [EXEC, M0]
-
-def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">;
-def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
-def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
- let simm16 = 0;
-}
-def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">;
-def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">;
-def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
- let simm16 = 0;
-}
-} // End hasSideEffects
-
-//===----------------------------------------------------------------------===//
-// VOPC Instructions
-//===----------------------------------------------------------------------===//
-
-let isCompare = 1, isCommutable = 1 in {
-
-defm V_CMP_F_F32 : VOPC_F32 , "v_cmp_f_f32">;
-defm V_CMP_LT_F32 : VOPC_F32 , "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">;
-defm V_CMP_EQ_F32 : VOPC_F32 , "v_cmp_eq_f32", COND_OEQ>;
-defm V_CMP_LE_F32 : VOPC_F32 , "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">;
-defm V_CMP_GT_F32 : VOPC_F32 , "v_cmp_gt_f32", COND_OGT>;
-defm V_CMP_LG_F32 : VOPC_F32 , "v_cmp_lg_f32", COND_ONE>;
-defm V_CMP_GE_F32 : VOPC_F32 , "v_cmp_ge_f32", COND_OGE>;
-defm V_CMP_O_F32 : VOPC_F32 , "v_cmp_o_f32", COND_O>;
-defm V_CMP_U_F32 : VOPC_F32 , "v_cmp_u_f32", COND_UO>;
-defm V_CMP_NGE_F32 : VOPC_F32 , "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">;
-defm V_CMP_NLG_F32 : VOPC_F32 , "v_cmp_nlg_f32", COND_UEQ>;
-defm V_CMP_NGT_F32 : VOPC_F32 , "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">;
-defm V_CMP_NLE_F32 : VOPC_F32 , "v_cmp_nle_f32", COND_UGT>;
-defm V_CMP_NEQ_F32 : VOPC_F32 , "v_cmp_neq_f32", COND_UNE>;
-defm V_CMP_NLT_F32 : VOPC_F32 , "v_cmp_nlt_f32", COND_UGE>;
-defm V_CMP_TRU_F32 : VOPC_F32 , "v_cmp_tru_f32">;
-
-
-defm V_CMPX_F_F32 : VOPCX_F32 , "v_cmpx_f_f32">;
-defm V_CMPX_LT_F32 : VOPCX_F32 , "v_cmpx_lt_f32", "v_cmpx_gt_f32">;
-defm V_CMPX_EQ_F32 : VOPCX_F32 , "v_cmpx_eq_f32">;
-defm V_CMPX_LE_F32 : VOPCX_F32 , "v_cmpx_le_f32", "v_cmpx_ge_f32">;
-defm V_CMPX_GT_F32 : VOPCX_F32 , "v_cmpx_gt_f32">;
-defm V_CMPX_LG_F32 : VOPCX_F32 , "v_cmpx_lg_f32">;
-defm V_CMPX_GE_F32 : VOPCX_F32 , "v_cmpx_ge_f32">;
-defm V_CMPX_O_F32 : VOPCX_F32 , "v_cmpx_o_f32">;
-defm V_CMPX_U_F32 : VOPCX_F32 , "v_cmpx_u_f32">;
-defm V_CMPX_NGE_F32 : VOPCX_F32 , "v_cmpx_nge_f32">;
-defm V_CMPX_NLG_F32 : VOPCX_F32 , "v_cmpx_nlg_f32">;
-defm V_CMPX_NGT_F32 : VOPCX_F32 , "v_cmpx_ngt_f32">;
-defm V_CMPX_NLE_F32 : VOPCX_F32 , "v_cmpx_nle_f32">;
-defm V_CMPX_NEQ_F32 : VOPCX_F32 , "v_cmpx_neq_f32">;
-defm V_CMPX_NLT_F32 : VOPCX_F32 , "v_cmpx_nlt_f32">;
-defm V_CMPX_TRU_F32 : VOPCX_F32 , "v_cmpx_tru_f32">;
-
-
-defm V_CMP_F_F64 : VOPC_F64 , "v_cmp_f_f64">;
-defm V_CMP_LT_F64 : VOPC_F64 , "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">;
-defm V_CMP_EQ_F64 : VOPC_F64 , "v_cmp_eq_f64", COND_OEQ>;
-defm V_CMP_LE_F64 : VOPC_F64 , "v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">;
-defm V_CMP_GT_F64 : VOPC_F64 , "v_cmp_gt_f64", COND_OGT>;
-defm V_CMP_LG_F64 : VOPC_F64 ,
"v_cmp_lg_f64", COND_ONE>; -defm V_CMP_GE_F64 : VOPC_F64 , "v_cmp_ge_f64", COND_OGE>; -defm V_CMP_O_F64 : VOPC_F64 , "v_cmp_o_f64", COND_O>; -defm V_CMP_U_F64 : VOPC_F64 , "v_cmp_u_f64", COND_UO>; -defm V_CMP_NGE_F64 : VOPC_F64 , "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">; -defm V_CMP_NLG_F64 : VOPC_F64 , "v_cmp_nlg_f64", COND_UEQ>; -defm V_CMP_NGT_F64 : VOPC_F64 , "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">; -defm V_CMP_NLE_F64 : VOPC_F64 , "v_cmp_nle_f64", COND_UGT>; -defm V_CMP_NEQ_F64 : VOPC_F64 , "v_cmp_neq_f64", COND_UNE>; -defm V_CMP_NLT_F64 : VOPC_F64 , "v_cmp_nlt_f64", COND_UGE>; -defm V_CMP_TRU_F64 : VOPC_F64 , "v_cmp_tru_f64">; - - -defm V_CMPX_F_F64 : VOPCX_F64 , "v_cmpx_f_f64">; -defm V_CMPX_LT_F64 : VOPCX_F64 , "v_cmpx_lt_f64", "v_cmpx_gt_f64">; -defm V_CMPX_EQ_F64 : VOPCX_F64 , "v_cmpx_eq_f64">; -defm V_CMPX_LE_F64 : VOPCX_F64 , "v_cmpx_le_f64", "v_cmpx_ge_f64">; -defm V_CMPX_GT_F64 : VOPCX_F64 , "v_cmpx_gt_f64">; -defm V_CMPX_LG_F64 : VOPCX_F64 , "v_cmpx_lg_f64">; -defm V_CMPX_GE_F64 : VOPCX_F64 , "v_cmpx_ge_f64">; -defm V_CMPX_O_F64 : VOPCX_F64 , "v_cmpx_o_f64">; -defm V_CMPX_U_F64 : VOPCX_F64 , "v_cmpx_u_f64">; -defm V_CMPX_NGE_F64 : VOPCX_F64 , "v_cmpx_nge_f64", "v_cmpx_nle_f64">; -defm V_CMPX_NLG_F64 : VOPCX_F64 , "v_cmpx_nlg_f64">; -defm V_CMPX_NGT_F64 : VOPCX_F64 , "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">; -defm V_CMPX_NLE_F64 : VOPCX_F64 , "v_cmpx_nle_f64">; -defm V_CMPX_NEQ_F64 : VOPCX_F64 , "v_cmpx_neq_f64">; -defm V_CMPX_NLT_F64 : VOPCX_F64 , "v_cmpx_nlt_f64">; -defm V_CMPX_TRU_F64 : VOPCX_F64 , "v_cmpx_tru_f64">; - - -let SubtargetPredicate = isSICI in { - -defm V_CMPS_F_F32 : VOPC_F32 , "v_cmps_f_f32">; -defm V_CMPS_LT_F32 : VOPC_F32 , "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; -defm V_CMPS_EQ_F32 : VOPC_F32 , "v_cmps_eq_f32">; -defm V_CMPS_LE_F32 : VOPC_F32 , "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">; -defm V_CMPS_GT_F32 : VOPC_F32 , "v_cmps_gt_f32">; -defm V_CMPS_LG_F32 : VOPC_F32 , "v_cmps_lg_f32">; -defm V_CMPS_GE_F32 : VOPC_F32 , "v_cmps_ge_f32">; -defm V_CMPS_O_F32 : VOPC_F32 , "v_cmps_o_f32">; -defm V_CMPS_U_F32 : VOPC_F32 , "v_cmps_u_f32">; -defm V_CMPS_NGE_F32 : VOPC_F32 , "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">; -defm V_CMPS_NLG_F32 : VOPC_F32 , "v_cmps_nlg_f32">; -defm V_CMPS_NGT_F32 : VOPC_F32 , "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">; -defm V_CMPS_NLE_F32 : VOPC_F32 , "v_cmps_nle_f32">; -defm V_CMPS_NEQ_F32 : VOPC_F32 , "v_cmps_neq_f32">; -defm V_CMPS_NLT_F32 : VOPC_F32 , "v_cmps_nlt_f32">; -defm V_CMPS_TRU_F32 : VOPC_F32 , "v_cmps_tru_f32">; - - -defm V_CMPSX_F_F32 : VOPCX_F32 , "v_cmpsx_f_f32">; -defm V_CMPSX_LT_F32 : VOPCX_F32 , "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">; -defm V_CMPSX_EQ_F32 : VOPCX_F32 , "v_cmpsx_eq_f32">; -defm V_CMPSX_LE_F32 : VOPCX_F32 , "v_cmpsx_le_f32", "v_cmpsx_ge_f32">; -defm V_CMPSX_GT_F32 : VOPCX_F32 , "v_cmpsx_gt_f32">; -defm V_CMPSX_LG_F32 : VOPCX_F32 , "v_cmpsx_lg_f32">; -defm V_CMPSX_GE_F32 : VOPCX_F32 , "v_cmpsx_ge_f32">; -defm V_CMPSX_O_F32 : VOPCX_F32 , "v_cmpsx_o_f32">; -defm V_CMPSX_U_F32 : VOPCX_F32 , "v_cmpsx_u_f32">; -defm V_CMPSX_NGE_F32 : VOPCX_F32 , "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">; -defm V_CMPSX_NLG_F32 : VOPCX_F32 , "v_cmpsx_nlg_f32">; -defm V_CMPSX_NGT_F32 : VOPCX_F32 , "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">; -defm V_CMPSX_NLE_F32 : VOPCX_F32 , "v_cmpsx_nle_f32">; -defm V_CMPSX_NEQ_F32 : VOPCX_F32 , "v_cmpsx_neq_f32">; -defm V_CMPSX_NLT_F32 : VOPCX_F32 , "v_cmpsx_nlt_f32">; -defm V_CMPSX_TRU_F32 : VOPCX_F32 , "v_cmpsx_tru_f32">; - - -defm V_CMPS_F_F64 : VOPC_F64 , "v_cmps_f_f64">; -defm 
V_CMPS_LT_F64 : VOPC_F64 , "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; -defm V_CMPS_EQ_F64 : VOPC_F64 , "v_cmps_eq_f64">; -defm V_CMPS_LE_F64 : VOPC_F64 , "v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">; -defm V_CMPS_GT_F64 : VOPC_F64 , "v_cmps_gt_f64">; -defm V_CMPS_LG_F64 : VOPC_F64 , "v_cmps_lg_f64">; -defm V_CMPS_GE_F64 : VOPC_F64 , "v_cmps_ge_f64">; -defm V_CMPS_O_F64 : VOPC_F64 , "v_cmps_o_f64">; -defm V_CMPS_U_F64 : VOPC_F64 , "v_cmps_u_f64">; -defm V_CMPS_NGE_F64 : VOPC_F64 , "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; -defm V_CMPS_NLG_F64 : VOPC_F64 , "v_cmps_nlg_f64">; -defm V_CMPS_NGT_F64 : VOPC_F64 , "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; -defm V_CMPS_NLE_F64 : VOPC_F64 , "v_cmps_nle_f64">; -defm V_CMPS_NEQ_F64 : VOPC_F64 , "v_cmps_neq_f64">; -defm V_CMPS_NLT_F64 : VOPC_F64 , "v_cmps_nlt_f64">; -defm V_CMPS_TRU_F64 : VOPC_F64 , "v_cmps_tru_f64">; - - -defm V_CMPSX_F_F64 : VOPCX_F64 , "v_cmpsx_f_f64">; -defm V_CMPSX_LT_F64 : VOPCX_F64 , "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; -defm V_CMPSX_EQ_F64 : VOPCX_F64 , "v_cmpsx_eq_f64">; -defm V_CMPSX_LE_F64 : VOPCX_F64 , "v_cmpsx_le_f64", "v_cmpsx_ge_f64">; -defm V_CMPSX_GT_F64 : VOPCX_F64 , "v_cmpsx_gt_f64">; -defm V_CMPSX_LG_F64 : VOPCX_F64 , "v_cmpsx_lg_f64">; -defm V_CMPSX_GE_F64 : VOPCX_F64 , "v_cmpsx_ge_f64">; -defm V_CMPSX_O_F64 : VOPCX_F64 , "v_cmpsx_o_f64">; -defm V_CMPSX_U_F64 : VOPCX_F64 , "v_cmpsx_u_f64">; -defm V_CMPSX_NGE_F64 : VOPCX_F64 , "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; -defm V_CMPSX_NLG_F64 : VOPCX_F64 , "v_cmpsx_nlg_f64">; -defm V_CMPSX_NGT_F64 : VOPCX_F64 , "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; -defm V_CMPSX_NLE_F64 : VOPCX_F64 , "v_cmpsx_nle_f64">; -defm V_CMPSX_NEQ_F64 : VOPCX_F64 , "v_cmpsx_neq_f64">; -defm V_CMPSX_NLT_F64 : VOPCX_F64 , "v_cmpsx_nlt_f64">; -defm V_CMPSX_TRU_F64 : VOPCX_F64 , "v_cmpsx_tru_f64">; - -} // End SubtargetPredicate = isSICI - -defm V_CMP_F_I32 : VOPC_I32 , "v_cmp_f_i32">; -defm V_CMP_LT_I32 : VOPC_I32 , "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; -defm V_CMP_EQ_I32 : VOPC_I32 , "v_cmp_eq_i32", COND_EQ>; -defm V_CMP_LE_I32 : VOPC_I32 , "v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">; -defm V_CMP_GT_I32 : VOPC_I32 , "v_cmp_gt_i32", COND_SGT>; -defm V_CMP_NE_I32 : VOPC_I32 , "v_cmp_ne_i32", COND_NE>; -defm V_CMP_GE_I32 : VOPC_I32 , "v_cmp_ge_i32", COND_SGE>; -defm V_CMP_T_I32 : VOPC_I32 , "v_cmp_t_i32">; - - -defm V_CMPX_F_I32 : VOPCX_I32 , "v_cmpx_f_i32">; -defm V_CMPX_LT_I32 : VOPCX_I32 , "v_cmpx_lt_i32", "v_cmpx_gt_i32">; -defm V_CMPX_EQ_I32 : VOPCX_I32 , "v_cmpx_eq_i32">; -defm V_CMPX_LE_I32 : VOPCX_I32 , "v_cmpx_le_i32", "v_cmpx_ge_i32">; -defm V_CMPX_GT_I32 : VOPCX_I32 , "v_cmpx_gt_i32">; -defm V_CMPX_NE_I32 : VOPCX_I32 , "v_cmpx_ne_i32">; -defm V_CMPX_GE_I32 : VOPCX_I32 , "v_cmpx_ge_i32">; -defm V_CMPX_T_I32 : VOPCX_I32 , "v_cmpx_t_i32">; - - -defm V_CMP_F_I64 : VOPC_I64 , "v_cmp_f_i64">; -defm V_CMP_LT_I64 : VOPC_I64 , "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">; -defm V_CMP_EQ_I64 : VOPC_I64 , "v_cmp_eq_i64", COND_EQ>; -defm V_CMP_LE_I64 : VOPC_I64 , "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">; -defm V_CMP_GT_I64 : VOPC_I64 , "v_cmp_gt_i64", COND_SGT>; -defm V_CMP_NE_I64 : VOPC_I64 , "v_cmp_ne_i64", COND_NE>; -defm V_CMP_GE_I64 : VOPC_I64 , "v_cmp_ge_i64", COND_SGE>; -defm V_CMP_T_I64 : VOPC_I64 , "v_cmp_t_i64">; - - -defm V_CMPX_F_I64 : VOPCX_I64 , "v_cmpx_f_i64">; -defm V_CMPX_LT_I64 : VOPCX_I64 , "v_cmpx_lt_i64", "v_cmpx_gt_i64">; -defm V_CMPX_EQ_I64 : VOPCX_I64 , "v_cmpx_eq_i64">; -defm V_CMPX_LE_I64 : VOPCX_I64 , "v_cmpx_le_i64", "v_cmpx_ge_i64">; -defm V_CMPX_GT_I64 : 
VOPCX_I64 , "v_cmpx_gt_i64">;
-defm V_CMPX_NE_I64 : VOPCX_I64 , "v_cmpx_ne_i64">;
-defm V_CMPX_GE_I64 : VOPCX_I64 , "v_cmpx_ge_i64">;
-defm V_CMPX_T_I64 : VOPCX_I64 , "v_cmpx_t_i64">;
-
-
-defm V_CMP_F_U32 : VOPC_I32 , "v_cmp_f_u32">;
-defm V_CMP_LT_U32 : VOPC_I32 , "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">;
-defm V_CMP_EQ_U32 : VOPC_I32 , "v_cmp_eq_u32", COND_EQ>;
-defm V_CMP_LE_U32 : VOPC_I32 , "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">;
-defm V_CMP_GT_U32 : VOPC_I32 , "v_cmp_gt_u32", COND_UGT>;
-defm V_CMP_NE_U32 : VOPC_I32 , "v_cmp_ne_u32", COND_NE>;
-defm V_CMP_GE_U32 : VOPC_I32 , "v_cmp_ge_u32", COND_UGE>;
-defm V_CMP_T_U32 : VOPC_I32 , "v_cmp_t_u32">;
-
-
-defm V_CMPX_F_U32 : VOPCX_I32 , "v_cmpx_f_u32">;
-defm V_CMPX_LT_U32 : VOPCX_I32 , "v_cmpx_lt_u32", "v_cmpx_gt_u32">;
-defm V_CMPX_EQ_U32 : VOPCX_I32 , "v_cmpx_eq_u32">;
-defm V_CMPX_LE_U32 : VOPCX_I32 , "v_cmpx_le_u32", "v_cmpx_ge_u32">;
-defm V_CMPX_GT_U32 : VOPCX_I32 , "v_cmpx_gt_u32">;
-defm V_CMPX_NE_U32 : VOPCX_I32 , "v_cmpx_ne_u32">;
-defm V_CMPX_GE_U32 : VOPCX_I32 , "v_cmpx_ge_u32">;
-defm V_CMPX_T_U32 : VOPCX_I32 , "v_cmpx_t_u32">;
-
-
-defm V_CMP_F_U64 : VOPC_I64 , "v_cmp_f_u64">;
-defm V_CMP_LT_U64 : VOPC_I64 , "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">;
-defm V_CMP_EQ_U64 : VOPC_I64 , "v_cmp_eq_u64", COND_EQ>;
-defm V_CMP_LE_U64 : VOPC_I64 , "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">;
-defm V_CMP_GT_U64 : VOPC_I64 , "v_cmp_gt_u64", COND_UGT>;
-defm V_CMP_NE_U64 : VOPC_I64 , "v_cmp_ne_u64", COND_NE>;
-defm V_CMP_GE_U64 : VOPC_I64 , "v_cmp_ge_u64", COND_UGE>;
-defm V_CMP_T_U64 : VOPC_I64 , "v_cmp_t_u64">;
-
-defm V_CMPX_F_U64 : VOPCX_I64 , "v_cmpx_f_u64">;
-defm V_CMPX_LT_U64 : VOPCX_I64 , "v_cmpx_lt_u64", "v_cmpx_gt_u64">;
-defm V_CMPX_EQ_U64 : VOPCX_I64 , "v_cmpx_eq_u64">;
-defm V_CMPX_LE_U64 : VOPCX_I64 , "v_cmpx_le_u64", "v_cmpx_ge_u64">;
-defm V_CMPX_GT_U64 : VOPCX_I64 , "v_cmpx_gt_u64">;
-defm V_CMPX_NE_U64 : VOPCX_I64 , "v_cmpx_ne_u64">;
-defm V_CMPX_GE_U64 : VOPCX_I64 , "v_cmpx_ge_u64">;
-defm V_CMPX_T_U64 : VOPCX_I64 , "v_cmpx_t_u64">;
-
-} // End isCompare = 1, isCommutable = 1
-
-defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 , "v_cmp_class_f32">;
-defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 , "v_cmpx_class_f32">;
-defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 , "v_cmp_class_f64">;
-defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 , "v_cmpx_class_f64">;
-
-//===----------------------------------------------------------------------===//
-// DS Instructions
-//===----------------------------------------------------------------------===//
-
-defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>;
-defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>;
-defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>;
-defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>;
-defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>;
-defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>;
-defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>;
-defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>;
-defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>;
-defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>;
-defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>;
-defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>;
-defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>;
-let mayLoad = 0 in {
-defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>;
-defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>;
-defm DS_WRITE2ST64_B32 :
DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; -} -defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; -defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; -defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>; -defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>; - -defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">; -defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">; -defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">; -defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">; -defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">; -let mayLoad = 0 in { -defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>; -defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>; -} -defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; -defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; -defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; -defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; -defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; -defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; -defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; -defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; -defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; -defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; -defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; -defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; -defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; -defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; -defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET < - 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32 ->; -defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET < - 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32 ->; -defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; -defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; -defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; -defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; -let SubtargetPredicate = isCI in { -defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; -} // End isCI -defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; -let mayStore = 0 in { -defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; -defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; -defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, "ds_read2st64_b32", VReg_64>; -defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>; -defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>; -defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>; -defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>; -} -defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">; -defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">; -defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">; -defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; -defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; -defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; -defm DS_INC_U64 : 
DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; -defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; -defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; -defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; -defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; -defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; -defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; -defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; -defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; -defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; -let mayLoad = 0 in { -defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; -defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; -defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; -} -defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; -defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; -defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; -defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; - -defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; -defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; -defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; -defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; -defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; -defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; -defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; -defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; -defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; -defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; -defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; -defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; -defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; -defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; -defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>; -defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>; -defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; -defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; -defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">; -defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; - -let mayStore = 0 in { -defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>; -defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>; -defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>; -} - -defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">; -defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">; -defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">; -defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">; -defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">; -defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">; -defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">; -defm 
DS_MIN_SRC2_U32 : DS_1A <0x87, "ds_min_src2_u32">;
-defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">;
-defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src2_b32">;
-defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">;
-defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">;
-defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">;
-
-defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">;
-defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">;
-
-defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">;
-defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">;
-defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">;
-defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">;
-defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">;
-defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">;
-defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">;
-defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">;
-defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">;
-defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">;
-defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">;
-defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">;
-defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">;
-
-defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">;
-defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">;
-
-//let SubtargetPredicate = isCI in {
-// DS_CONDXCHG32_RTN_B64
-// DS_CONDXCHG32_RTN_B128
-//} // End isCI
-
-//===----------------------------------------------------------------------===//
-// MUBUF Instructions
-//===----------------------------------------------------------------------===//
-
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper <
- mubuf<0x00>, "buffer_load_format_x", VGPR_32
->;
-defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper <
- mubuf<0x01>, "buffer_load_format_xy", VReg_64
->;
-defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper <
- mubuf<0x02>, "buffer_load_format_xyz", VReg_96
->;
-defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <
- mubuf<0x03>, "buffer_load_format_xyzw", VReg_128
->;
-defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper <
- mubuf<0x04>, "buffer_store_format_x", VGPR_32
->;
-defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper <
- mubuf<0x05>, "buffer_store_format_xy", VReg_64
->;
-defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Store_Helper <
- mubuf<0x06>, "buffer_store_format_xyz", VReg_96
->;
-defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper <
- mubuf<0x07>, "buffer_store_format_xyzw", VReg_128
->;
-defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
- mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global
->;
-defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
- mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global
->;
-defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
- mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global
->;
-defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
- mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
->;
-defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
- mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load
->;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
- mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load
->;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
- mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load
->;
-
-defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
- mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global
->;
-
-defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
- mubuf<0x1a>, "buffer_store_short", VGPR_32, i32,
truncstorei16_global ->; - -defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < - mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store ->; - -defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < - mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store ->; - -defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < - mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store ->; - -defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < - mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global ->; -//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ , "buffer_atomic_cmpswap", []>; -defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < - mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global ->; -defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < - mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global ->; -//def BUFFER_ATOMIC_RSUB : MUBUF_ , "buffer_atomic_rsub", []>; // isn't on CI & VI -defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < - mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global ->; -defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < - mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global ->; -defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < - mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global ->; -defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < - mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global ->; -defm BUFFER_ATOMIC_AND : MUBUF_Atomic < - mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global ->; -defm BUFFER_ATOMIC_OR : MUBUF_Atomic < - mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global ->; -defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < - mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global ->; -//def BUFFER_ATOMIC_INC : MUBUF_ , "buffer_atomic_inc", []>; -//def BUFFER_ATOMIC_DEC : MUBUF_ , "buffer_atomic_dec", []>; -//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ , "buffer_atomic_fcmpswap", []>; // isn't on VI -//def BUFFER_ATOMIC_FMIN : MUBUF_ , "buffer_atomic_fmin", []>; // isn't on VI -//def BUFFER_ATOMIC_FMAX : MUBUF_ , "buffer_atomic_fmax", []>; // isn't on VI -//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 , "buffer_atomic_swap_x2", []>; -//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_cmpswap_x2", []>; -//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 , "buffer_atomic_add_x2", []>; -//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 , "buffer_atomic_sub_x2", []>; -//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 , "buffer_atomic_rsub_x2", []>; // isn't on CI & VI -//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 , "buffer_atomic_smin_x2", []>; -//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 , "buffer_atomic_umin_x2", []>; -//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 , "buffer_atomic_smax_x2", []>; -//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 , "buffer_atomic_umax_x2", []>; -//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 , "buffer_atomic_and_x2", []>; -//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 , "buffer_atomic_or_x2", []>; -//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 , "buffer_atomic_xor_x2", []>; -//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 , "buffer_atomic_inc_x2", []>; -//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 , "buffer_atomic_dec_x2", []>; -//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_fcmpswap_x2", []>; // isn't on VI -//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 , "buffer_atomic_fmin_x2", []>; // isn't on VI -//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 , "buffer_atomic_fmax_x2", []>; // isn't on VI -//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 , "buffer_wbinvl1_sc", []>; // isn't on CI & VI -//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 , 
"buffer_wbinvl1_vol", []>; // isn't on SI -//def BUFFER_WBINVL1 : MUBUF_WBINVL1 , "buffer_wbinvl1", []>; - -//===----------------------------------------------------------------------===// -// MTBUF Instructions -//===----------------------------------------------------------------------===// - -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>; - -//===----------------------------------------------------------------------===// -// MIMG Instructions -//===----------------------------------------------------------------------===// - -defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; -defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; -//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; -//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; -//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; -//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; -//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; -//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; -//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; -//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; -defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; -//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>; -//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>; -//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>; -//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>; -//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>; -//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>; -//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>; -//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>; -//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>; -//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>; -//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>; -//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>; -//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>; -//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; -defm 
IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; -defm 
IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; -defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; -//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; -//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; - -//===----------------------------------------------------------------------===// -// Flat Instructions -//===----------------------------------------------------------------------===// - -let Predicates = [HasFlatAddressSpace] in { -def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "flat_load_dwordx4", VReg_128>; -def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "flat_load_dwordx3", VReg_96>; - -def FLAT_STORE_BYTE : FLAT_Store_Helper < - 0x00000018, "flat_store_byte", VGPR_32 ->; - -def FLAT_STORE_SHORT : FLAT_Store_Helper < - 0x0000001a, "flat_store_short", VGPR_32 ->; - -def FLAT_STORE_DWORD : FLAT_Store_Helper < - 0x0000001c, "flat_store_dword", VGPR_32 ->; - -def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < - 0x0000001d, "flat_store_dwordx2", VReg_64 ->; - -def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < - 0x0000001e, "flat_store_dwordx4", VReg_128 ->; - -def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < - 0x0000001e, "flat_store_dwordx3", VReg_96 ->; - -//def FLAT_ATOMIC_SWAP : FLAT_ <0x00000030, "flat_atomic_swap", []>; -//def FLAT_ATOMIC_CMPSWAP : FLAT_ <0x00000031, "flat_atomic_cmpswap", []>; -//def FLAT_ATOMIC_ADD : FLAT_ <0x00000032, "flat_atomic_add", []>; -//def FLAT_ATOMIC_SUB : FLAT_ <0x00000033, "flat_atomic_sub", []>; -//def FLAT_ATOMIC_RSUB : FLAT_ <0x00000034, "flat_atomic_rsub", []>; -//def FLAT_ATOMIC_SMIN : FLAT_ <0x00000035, "flat_atomic_smin", []>; -//def FLAT_ATOMIC_UMIN : FLAT_ <0x00000036, "flat_atomic_umin", []>; -//def FLAT_ATOMIC_SMAX : FLAT_ <0x00000037, "flat_atomic_smax", []>; -//def FLAT_ATOMIC_UMAX : FLAT_ <0x00000038, "flat_atomic_umax", []>; -//def FLAT_ATOMIC_AND : FLAT_ <0x00000039, "flat_atomic_and", 
[]>; -//def FLAT_ATOMIC_OR : FLAT_ <0x0000003a, "flat_atomic_or", []>; -//def FLAT_ATOMIC_XOR : FLAT_ <0x0000003b, "flat_atomic_xor", []>; -//def FLAT_ATOMIC_INC : FLAT_ <0x0000003c, "flat_atomic_inc", []>; -//def FLAT_ATOMIC_DEC : FLAT_ <0x0000003d, "flat_atomic_dec", []>; -//def FLAT_ATOMIC_FCMPSWAP : FLAT_ <0x0000003e, "flat_atomic_fcmpswap", []>; -//def FLAT_ATOMIC_FMIN : FLAT_ <0x0000003f, "flat_atomic_fmin", []>; -//def FLAT_ATOMIC_FMAX : FLAT_ <0x00000040, "flat_atomic_fmax", []>; -//def FLAT_ATOMIC_SWAP_X2 : FLAT_X2 <0x00000050, "flat_atomic_swap_x2", []>; -//def FLAT_ATOMIC_CMPSWAP_X2 : FLAT_X2 <0x00000051, "flat_atomic_cmpswap_x2", []>; -//def FLAT_ATOMIC_ADD_X2 : FLAT_X2 <0x00000052, "flat_atomic_add_x2", []>; -//def FLAT_ATOMIC_SUB_X2 : FLAT_X2 <0x00000053, "flat_atomic_sub_x2", []>; -//def FLAT_ATOMIC_RSUB_X2 : FLAT_X2 <0x00000054, "flat_atomic_rsub_x2", []>; -//def FLAT_ATOMIC_SMIN_X2 : FLAT_X2 <0x00000055, "flat_atomic_smin_x2", []>; -//def FLAT_ATOMIC_UMIN_X2 : FLAT_X2 <0x00000056, "flat_atomic_umin_x2", []>; -//def FLAT_ATOMIC_SMAX_X2 : FLAT_X2 <0x00000057, "flat_atomic_smax_x2", []>; -//def FLAT_ATOMIC_UMAX_X2 : FLAT_X2 <0x00000058, "flat_atomic_umax_x2", []>; -//def FLAT_ATOMIC_AND_X2 : FLAT_X2 <0x00000059, "flat_atomic_and_x2", []>; -//def FLAT_ATOMIC_OR_X2 : FLAT_X2 <0x0000005a, "flat_atomic_or_x2", []>; -//def FLAT_ATOMIC_XOR_X2 : FLAT_X2 <0x0000005b, "flat_atomic_xor_x2", []>; -//def FLAT_ATOMIC_INC_X2 : FLAT_X2 <0x0000005c, "flat_atomic_inc_x2", []>; -//def FLAT_ATOMIC_DEC_X2 : FLAT_X2 <0x0000005d, "flat_atomic_dec_x2", []>; -//def FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_X2 <0x0000005e, "flat_atomic_fcmpswap_x2", []>; -//def FLAT_ATOMIC_FMIN_X2 : FLAT_X2 <0x0000005f, "flat_atomic_fmin_x2", []>; -//def FLAT_ATOMIC_FMAX_X2 : FLAT_X2 <0x00000060, "flat_atomic_fmax_x2", []>; - -} // End HasFlatAddressSpace predicate -//===----------------------------------------------------------------------===// -// VOP1 Instructions -//===----------------------------------------------------------------------===// - -let vdst = 0, src0 = 0 in { -defm V_NOP : VOP1_m , (outs), (ins), "v_nop", [], "v_nop">; -} - -let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { -defm V_MOV_B32 : VOP1Inst , "v_mov_b32", VOP_I32_I32>; -} // End isMoveImm = 1 - -let Uses = [EXEC] in { - -// FIXME: Specify SchedRW for READFIRSTLANE_B32 - -def V_READFIRSTLANE_B32 : VOP1 < - 0x00000002, - (outs SReg_32:$vdst), - (ins VGPR_32:$src0), - "v_readfirstlane_b32 $vdst, $src0", - [] ->; - -} - -let SchedRW = [WriteQuarterRate32] in { - -defm V_CVT_I32_F64 : VOP1Inst , "v_cvt_i32_f64", - VOP_I32_F64, fp_to_sint ->; -defm V_CVT_F64_I32 : VOP1Inst , "v_cvt_f64_i32", - VOP_F64_I32, sint_to_fp ->; -defm V_CVT_F32_I32 : VOP1Inst , "v_cvt_f32_i32", - VOP_F32_I32, sint_to_fp ->; -defm V_CVT_F32_U32 : VOP1Inst , "v_cvt_f32_u32", - VOP_F32_I32, uint_to_fp ->; -defm V_CVT_U32_F32 : VOP1Inst , "v_cvt_u32_f32", - VOP_I32_F32, fp_to_uint ->; -defm V_CVT_I32_F32 : VOP1Inst , "v_cvt_i32_f32", - VOP_I32_F32, fp_to_sint ->; -defm V_CVT_F16_F32 : VOP1Inst , "v_cvt_f16_f32", - VOP_I32_F32, fp_to_f16 ->; -defm V_CVT_F32_F16 : VOP1Inst , "v_cvt_f32_f16", - VOP_F32_I32, f16_to_fp ->; -defm V_CVT_RPI_I32_F32 : VOP1Inst , "v_cvt_rpi_i32_f32", - VOP_I32_F32, cvt_rpi_i32_f32>; -defm V_CVT_FLR_I32_F32 : VOP1Inst , "v_cvt_flr_i32_f32", - VOP_I32_F32, cvt_flr_i32_f32>; -defm V_CVT_OFF_F32_I4 : VOP1Inst , "v_cvt_off_f32_i4", VOP_F32_I32>; -defm V_CVT_F32_F64 : VOP1Inst , "v_cvt_f32_f64", - VOP_F32_F64, fround ->; -defm V_CVT_F64_F32 : 
VOP1Inst , "v_cvt_f64_f32",
- VOP_F64_F32, fextend
->;
-defm V_CVT_F32_UBYTE0 : VOP1Inst , "v_cvt_f32_ubyte0",
- VOP_F32_I32, AMDGPUcvt_f32_ubyte0
->;
-defm V_CVT_F32_UBYTE1 : VOP1Inst , "v_cvt_f32_ubyte1",
- VOP_F32_I32, AMDGPUcvt_f32_ubyte1
->;
-defm V_CVT_F32_UBYTE2 : VOP1Inst , "v_cvt_f32_ubyte2",
- VOP_F32_I32, AMDGPUcvt_f32_ubyte2
->;
-defm V_CVT_F32_UBYTE3 : VOP1Inst , "v_cvt_f32_ubyte3",
- VOP_F32_I32, AMDGPUcvt_f32_ubyte3
->;
-defm V_CVT_U32_F64 : VOP1Inst , "v_cvt_u32_f64",
- VOP_I32_F64, fp_to_uint
->;
-defm V_CVT_F64_U32 : VOP1Inst , "v_cvt_f64_u32",
- VOP_F64_I32, uint_to_fp
->;
-
-} // let SchedRW = [WriteQuarterRate32]
-
-defm V_FRACT_F32 : VOP1Inst , "v_fract_f32",
- VOP_F32_F32, AMDGPUfract
->;
-defm V_TRUNC_F32 : VOP1Inst , "v_trunc_f32",
- VOP_F32_F32, ftrunc
->;
-defm V_CEIL_F32 : VOP1Inst , "v_ceil_f32",
- VOP_F32_F32, fceil
->;
-defm V_RNDNE_F32 : VOP1Inst , "v_rndne_f32",
- VOP_F32_F32, frint
->;
-defm V_FLOOR_F32 : VOP1Inst , "v_floor_f32",
- VOP_F32_F32, ffloor
->;
-defm V_EXP_F32 : VOP1Inst , "v_exp_f32",
- VOP_F32_F32, fexp2
->;
-
-let SchedRW = [WriteQuarterRate32] in {
-
-defm V_LOG_F32 : VOP1Inst , "v_log_f32",
- VOP_F32_F32, flog2
->;
-defm V_RCP_F32 : VOP1Inst , "v_rcp_f32",
- VOP_F32_F32, AMDGPUrcp
->;
-defm V_RCP_IFLAG_F32 : VOP1Inst , "v_rcp_iflag_f32",
- VOP_F32_F32
->;
-defm V_RSQ_F32 : VOP1Inst , "v_rsq_f32",
- VOP_F32_F32, AMDGPUrsq
->;
-
-} //let SchedRW = [WriteQuarterRate32]
-
-let SchedRW = [WriteDouble] in {
-
-defm V_RCP_F64 : VOP1Inst , "v_rcp_f64",
- VOP_F64_F64, AMDGPUrcp
->;
-defm V_RSQ_F64 : VOP1Inst , "v_rsq_f64",
- VOP_F64_F64, AMDGPUrsq
->;
-
-} // let SchedRW = [WriteDouble]
-
-defm V_SQRT_F32 : VOP1Inst , "v_sqrt_f32",
- VOP_F32_F32, fsqrt
->;
-
-let SchedRW = [WriteDouble] in {
-
-defm V_SQRT_F64 : VOP1Inst , "v_sqrt_f64",
- VOP_F64_F64, fsqrt
->;
-
-} // let SchedRW = [WriteDouble]
-
-defm V_SIN_F32 : VOP1Inst , "v_sin_f32",
- VOP_F32_F32, AMDGPUsin
->;
-defm V_COS_F32 : VOP1Inst , "v_cos_f32",
- VOP_F32_F32, AMDGPUcos
->;
-defm V_NOT_B32 : VOP1Inst , "v_not_b32", VOP_I32_I32>;
-defm V_BFREV_B32 : VOP1Inst , "v_bfrev_b32", VOP_I32_I32>;
-defm V_FFBH_U32 : VOP1Inst , "v_ffbh_u32", VOP_I32_I32>;
-defm V_FFBL_B32 : VOP1Inst , "v_ffbl_b32", VOP_I32_I32>;
-defm V_FFBH_I32 : VOP1Inst , "v_ffbh_i32", VOP_I32_I32>;
-defm V_FREXP_EXP_I32_F64 : VOP1Inst , "v_frexp_exp_i32_f64",
- VOP_I32_F64
->;
-defm V_FREXP_MANT_F64 : VOP1Inst , "v_frexp_mant_f64",
- VOP_F64_F64
->;
-defm V_FRACT_F64 : VOP1Inst , "v_fract_f64", VOP_F64_F64>;
-defm V_FREXP_EXP_I32_F32 : VOP1Inst , "v_frexp_exp_i32_f32",
- VOP_I32_F32
->;
-defm V_FREXP_MANT_F32 : VOP1Inst , "v_frexp_mant_f32",
- VOP_F32_F32
->;
-let vdst = 0, src0 = 0 in {
-defm V_CLREXCP : VOP1_m , (outs), (ins), "v_clrexcp", [],
- "v_clrexcp"
->;
-}
-defm V_MOVRELD_B32 : VOP1Inst , "v_movreld_b32", VOP_I32_I32>;
-defm V_MOVRELS_B32 : VOP1Inst , "v_movrels_b32", VOP_I32_I32>;
-defm V_MOVRELSD_B32 : VOP1Inst , "v_movrelsd_b32", VOP_I32_I32>;
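The MOVREL family implements indirect VGPR addressing: M0 holds a runtime offset that the hardware adds to the register number encoded in the instruction (relative source for V_MOVRELS, relative destination for V_MOVRELD, both for V_MOVRELSD). A minimal usage sketch, with illustrative register numbers:

// s_mov_b32 m0, s4       ; dynamic element index
// v_movrels_b32 v0, v8   ; v0 = VGPR[8 + M0], a dynamic vector-element read
// v_movreld_b32 v8, v0   ; VGPR[8 + M0] = v0, a dynamic vector-element write

The SI_INDIRECT_SRC and SI_INDIRECT_DST pseudos defined later in this file are lowered to sequences of this form.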
-
-// These instructions only exist on SI and CI
-let SubtargetPredicate = isSICI in {
-
-let SchedRW = [WriteQuarterRate32] in {
-
-defm V_MOV_FED_B32 : VOP1InstSI , "v_mov_fed_b32", VOP_I32_I32>;
-defm V_LOG_CLAMP_F32 : VOP1InstSI , "v_log_clamp_f32", VOP_F32_F32>;
-defm V_RCP_CLAMP_F32 : VOP1InstSI , "v_rcp_clamp_f32", VOP_F32_F32>;
-defm V_RCP_LEGACY_F32 : VOP1InstSI , "v_rcp_legacy_f32", VOP_F32_F32>;
-defm V_RSQ_CLAMP_F32 : VOP1InstSI , "v_rsq_clamp_f32",
- VOP_F32_F32, AMDGPUrsq_clamped
->;
-defm V_RSQ_LEGACY_F32 : VOP1InstSI , "v_rsq_legacy_f32",
- VOP_F32_F32, AMDGPUrsq_legacy
->;
-
-} // End let SchedRW = [WriteQuarterRate32]
-
-let SchedRW = [WriteDouble] in {
-
-defm V_RCP_CLAMP_F64 : VOP1InstSI , "v_rcp_clamp_f64", VOP_F64_F64>;
-defm V_RSQ_CLAMP_F64 : VOP1InstSI , "v_rsq_clamp_f64",
- VOP_F64_F64, AMDGPUrsq_clamped
->;
-
-} // End SchedRW = [WriteDouble]
-
-} // End SubtargetPredicate = isSICI
-
-//===----------------------------------------------------------------------===//
-// VINTRP Instructions
-//===----------------------------------------------------------------------===//
-
-let Uses = [M0] in {
-
-// FIXME: Specify SchedRW for VINTRP instructions.
-
-multiclass V_INTERP_P1_F32_m : VINTRP_m <
- 0x00000000,
- (outs VGPR_32:$dst),
- (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr),
- "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]",
- [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan),
- (i32 imm:$attr)))]
->;
-
-let OtherPredicates = [has32BankLDS] in {
-
-defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
-
-} // End OtherPredicates = [has32BankLDS]
-
-let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in {
-
-defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
-
-} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst"
-
-let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in {
-
-defm V_INTERP_P2_F32 : VINTRP_m <
- 0x00000001,
- (outs VGPR_32:$dst),
- (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr),
- "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]",
- [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, i32:$j, (i32 imm:$attr_chan),
- (i32 imm:$attr)))]>;
-
-} // End DisableEncoding = "$src0", Constraints = "$src0 = $dst"
-
-defm V_INTERP_MOV_F32 : VINTRP_m <
- 0x00000002,
- (outs VGPR_32:$dst),
- (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr),
- "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]",
- [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan),
- (i32 imm:$attr)))]>;
-
-} // End Uses = [M0]
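As the patterns above suggest, a full attribute interpolation is a two-instruction sequence: v_interp_p1_f32 forms a partial result from the i barycentric coordinate, and v_interp_p2_f32 accumulates the j term into the same register, with M0 holding the base of the attribute data in LDS (hence the Uses = [M0] block). A usage sketch, with illustrative registers:

// s_mov_b32 m0, s2                        ; attribute base in LDS
// v_interp_p1_f32 v2, v0, attr0.x         ; v2 = p10 * i + p0
// v_interp_p2_f32 v2, [v2], v1, attr0.x   ; v2 += p20 * j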
"v_max_i32", VOP_I32_I32_I32>; -defm V_MIN_U32 : VOP2Inst , "v_min_u32", VOP_I32_I32_I32>; -defm V_MAX_U32 : VOP2Inst , "v_max_u32", VOP_I32_I32_I32>; - -defm V_LSHRREV_B32 : VOP2Inst < - vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, - "v_lshr_b32" ->; - -defm V_ASHRREV_I32 : VOP2Inst < - vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, - "v_ashr_i32" ->; - -defm V_LSHLREV_B32 : VOP2Inst < - vop2<0x1a, 0x12>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, - "v_lshl_b32" ->; - -defm V_AND_B32 : VOP2Inst , "v_and_b32", VOP_I32_I32_I32>; -defm V_OR_B32 : VOP2Inst , "v_or_b32", VOP_I32_I32_I32>; -defm V_XOR_B32 : VOP2Inst , "v_xor_b32", VOP_I32_I32_I32>; - -defm V_MAC_F32 : VOP2Inst , "v_mac_f32", VOP_F32_F32_F32>; -} // End isCommutable = 1 - -defm V_MADMK_F32 : VOP2MADK , "v_madmk_f32">; - -let isCommutable = 1 in { -defm V_MADAK_F32 : VOP2MADK , "v_madak_f32">; -} // End isCommutable = 1 - -let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC -// No patterns so that the scalar instructions are always selected. -// The scalar versions will be replaced with vector when needed later. - -// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, -// but the VI instructions behave the same as the SI versions. -defm V_ADD_I32 : VOP2bInst , "v_add_i32", - VOP_I32_I32_I32, add ->; -defm V_SUB_I32 : VOP2bInst , "v_sub_i32", VOP_I32_I32_I32>; - -defm V_SUBREV_I32 : VOP2bInst , "v_subrev_i32", - VOP_I32_I32_I32, null_frag, "v_sub_i32" ->; - -let Uses = [VCC] in { // Carry-in comes from VCC -defm V_ADDC_U32 : VOP2bInst , "v_addc_u32", - VOP_I32_I32_I32_VCC ->; -defm V_SUBB_U32 : VOP2bInst , "v_subb_u32", - VOP_I32_I32_I32_VCC ->; -defm V_SUBBREV_U32 : VOP2bInst , "v_subbrev_u32", - VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" ->; - -} // End Uses = [VCC] -} // End isCommutable = 1, Defs = [VCC] - -defm V_READLANE_B32 : VOP2SI_3VI_m < - vop3 <0x001, 0x289>, - "v_readlane_b32", - (outs SReg_32:$vdst), - (ins VGPR_32:$src0, SCSrc_32:$src1), - "v_readlane_b32 $vdst, $src0, $src1" ->; - -defm V_WRITELANE_B32 : VOP2SI_3VI_m < - vop3 <0x002, 0x28a>, - "v_writelane_b32", - (outs VGPR_32:$vdst), - (ins SReg_32:$src0, SCSrc_32:$src1), - "v_writelane_b32 $vdst, $src0, $src1" ->; - -// These instructions only exist on SI and CI -let SubtargetPredicate = isSICI in { - -defm V_MIN_LEGACY_F32 : VOP2InstSI , "v_min_legacy_f32", - VOP_F32_F32_F32, AMDGPUfmin_legacy ->; -defm V_MAX_LEGACY_F32 : VOP2InstSI , "v_max_legacy_f32", - VOP_F32_F32_F32, AMDGPUfmax_legacy ->; - -let isCommutable = 1 in { -defm V_LSHR_B32 : VOP2InstSI , "v_lshr_b32", VOP_I32_I32_I32>; -defm V_ASHR_I32 : VOP2InstSI , "v_ashr_i32", VOP_I32_I32_I32>; -defm V_LSHL_B32 : VOP2InstSI , "v_lshl_b32", VOP_I32_I32_I32>; -} // End isCommutable = 1 -} // End let SubtargetPredicate = SICI - -let isCommutable = 1 in { -defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst , "v_mac_legacy_f32", - VOP_F32_F32_F32 ->; -} // End isCommutable = 1 - -defm V_BFM_B32 : VOP2_VI3_Inst , "v_bfm_b32", - VOP_I32_I32_I32 ->; -defm V_BCNT_U32_B32 : VOP2_VI3_Inst , "v_bcnt_u32_b32", - VOP_I32_I32_I32 ->; -defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_lo_u32_b32", - VOP_I32_I32_I32 ->; -defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_hi_u32_b32", - VOP_I32_I32_I32 ->; -defm V_LDEXP_F32 : VOP2_VI3_Inst , "v_ldexp_f32", - VOP_F32_F32_I32, AMDGPUldexp ->; - -defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst , "v_cvt_pkaccum_u8_f32", - VOP_I32_F32_I32>; // TODO: set "Uses = dst" - -defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst , 
"v_cvt_pknorm_i16_f32", - VOP_I32_F32_F32 ->; -defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst , "v_cvt_pknorm_u16_f32", - VOP_I32_F32_F32 ->; -defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst , "v_cvt_pkrtz_f16_f32", - VOP_I32_F32_F32, int_SI_packf16 ->; -defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst , "v_cvt_pk_u16_u32", - VOP_I32_I32_I32 ->; -defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst , "v_cvt_pk_i16_i32", - VOP_I32_I32_I32 ->; - -//===----------------------------------------------------------------------===// -// VOP3 Instructions -//===----------------------------------------------------------------------===// - -let isCommutable = 1 in { -defm V_MAD_LEGACY_F32 : VOP3Inst , "v_mad_legacy_f32", - VOP_F32_F32_F32_F32 ->; - -defm V_MAD_F32 : VOP3Inst , "v_mad_f32", - VOP_F32_F32_F32_F32, fmad ->; - -defm V_MAD_I32_I24 : VOP3Inst , "v_mad_i32_i24", - VOP_I32_I32_I32_I32, AMDGPUmad_i24 ->; -defm V_MAD_U32_U24 : VOP3Inst , "v_mad_u32_u24", - VOP_I32_I32_I32_I32, AMDGPUmad_u24 ->; -} // End isCommutable = 1 - -defm V_CUBEID_F32 : VOP3Inst , "v_cubeid_f32", - VOP_F32_F32_F32_F32 ->; -defm V_CUBESC_F32 : VOP3Inst , "v_cubesc_f32", - VOP_F32_F32_F32_F32 ->; -defm V_CUBETC_F32 : VOP3Inst , "v_cubetc_f32", - VOP_F32_F32_F32_F32 ->; -defm V_CUBEMA_F32 : VOP3Inst , "v_cubema_f32", - VOP_F32_F32_F32_F32 ->; - -defm V_BFE_U32 : VOP3Inst , "v_bfe_u32", - VOP_I32_I32_I32_I32, AMDGPUbfe_u32 ->; -defm V_BFE_I32 : VOP3Inst , "v_bfe_i32", - VOP_I32_I32_I32_I32, AMDGPUbfe_i32 ->; - -defm V_BFI_B32 : VOP3Inst , "v_bfi_b32", - VOP_I32_I32_I32_I32, AMDGPUbfi ->; - -let isCommutable = 1 in { -defm V_FMA_F32 : VOP3Inst , "v_fma_f32", - VOP_F32_F32_F32_F32, fma ->; -defm V_FMA_F64 : VOP3Inst , "v_fma_f64", - VOP_F64_F64_F64_F64, fma ->; -} // End isCommutable = 1 - -//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; -defm V_ALIGNBIT_B32 : VOP3Inst , "v_alignbit_b32", - VOP_I32_I32_I32_I32 ->; -defm V_ALIGNBYTE_B32 : VOP3Inst , "v_alignbyte_b32", - VOP_I32_I32_I32_I32 ->; - -defm V_MIN3_F32 : VOP3Inst , "v_min3_f32", - VOP_F32_F32_F32_F32, AMDGPUfmin3>; - -defm V_MIN3_I32 : VOP3Inst , "v_min3_i32", - VOP_I32_I32_I32_I32, AMDGPUsmin3 ->; -defm V_MIN3_U32 : VOP3Inst , "v_min3_u32", - VOP_I32_I32_I32_I32, AMDGPUumin3 ->; -defm V_MAX3_F32 : VOP3Inst , "v_max3_f32", - VOP_F32_F32_F32_F32, AMDGPUfmax3 ->; -defm V_MAX3_I32 : VOP3Inst , "v_max3_i32", - VOP_I32_I32_I32_I32, AMDGPUsmax3 ->; -defm V_MAX3_U32 : VOP3Inst , "v_max3_u32", - VOP_I32_I32_I32_I32, AMDGPUumax3 ->; -defm V_MED3_F32 : VOP3Inst , "v_med3_f32", - VOP_F32_F32_F32_F32 ->; -defm V_MED3_I32 : VOP3Inst , "v_med3_i32", - VOP_I32_I32_I32_I32 ->; -defm V_MED3_U32 : VOP3Inst , "v_med3_u32", - VOP_I32_I32_I32_I32 ->; - -//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; -//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; -//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; -defm V_SAD_U32 : VOP3Inst , "v_sad_u32", - VOP_I32_I32_I32_I32 ->; -////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; -defm V_DIV_FIXUP_F32 : VOP3Inst < - vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup ->; - -let SchedRW = [WriteDouble] in { - -defm V_DIV_FIXUP_F64 : VOP3Inst < - vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup ->; - -} // let SchedRW = [WriteDouble] - -let SchedRW = [WriteDouble] in { -let isCommutable = 1 in { - -defm V_ADD_F64 : VOP3Inst , "v_add_f64", - VOP_F64_F64_F64, fadd ->; -defm V_MUL_F64 : VOP3Inst , "v_mul_f64", - VOP_F64_F64_F64, fmul ->; - -defm V_MIN_F64 : VOP3Inst , "v_min_f64", - 
VOP_F64_F64_F64, fminnum ->; -defm V_MAX_F64 : VOP3Inst , "v_max_f64", - VOP_F64_F64_F64, fmaxnum ->; - -} // isCommutable = 1 - -defm V_LDEXP_F64 : VOP3Inst , "v_ldexp_f64", - VOP_F64_F64_I32, AMDGPUldexp ->; - -} // let SchedRW = [WriteDouble] - -let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { - -defm V_MUL_LO_U32 : VOP3Inst , "v_mul_lo_u32", - VOP_I32_I32_I32 ->; -defm V_MUL_HI_U32 : VOP3Inst , "v_mul_hi_u32", - VOP_I32_I32_I32 ->; - -defm V_MUL_LO_I32 : VOP3Inst , "v_mul_lo_i32", - VOP_I32_I32_I32 ->; -defm V_MUL_HI_I32 : VOP3Inst , "v_mul_hi_i32", - VOP_I32_I32_I32 ->; - -} // isCommutable = 1, SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteFloatFMA, WriteSALU] in { -defm V_DIV_SCALE_F32 : VOP3b_32 , "v_div_scale_f32", []>; -} - -let SchedRW = [WriteDouble, WriteSALU] in { -// Double precision division pre-scale. -defm V_DIV_SCALE_F64 : VOP3b_64 , "v_div_scale_f64", []>; -} // let SchedRW = [WriteDouble] - -let isCommutable = 1, Uses = [VCC] in { - -// v_div_fmas_f32: -// result = src0 * src1 + src2 -// if (vcc) -// result *= 2^32 -// -defm V_DIV_FMAS_F32 : VOP3_VCC_Inst , "v_div_fmas_f32", - VOP_F32_F32_F32_F32, AMDGPUdiv_fmas ->; - -let SchedRW = [WriteDouble] in { -// v_div_fmas_f64: -// result = src0 * src1 + src2 -// if (vcc) -// result *= 2^64 -// -defm V_DIV_FMAS_F64 : VOP3_VCC_Inst , "v_div_fmas_f64", - VOP_F64_F64_F64_F64, AMDGPUdiv_fmas ->; - -} // End SchedRW = [WriteDouble] -} // End isCommutable = 1 - -//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; -//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; -//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; - -let SchedRW = [WriteDouble] in { -defm V_TRIG_PREOP_F64 : VOP3Inst < - vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop ->; - -} // let SchedRW = [WriteDouble] - -// These instructions only exist on SI and CI -let SubtargetPredicate = isSICI in { - -defm V_LSHL_B64 : VOP3Inst , "v_lshl_b64", VOP_I64_I64_I32>; -defm V_LSHR_B64 : VOP3Inst , "v_lshr_b64", VOP_I64_I64_I32>; -defm V_ASHR_I64 : VOP3Inst , "v_ashr_i64", VOP_I64_I64_I32>; - -defm V_MULLIT_F32 : VOP3Inst , "v_mullit_f32", - VOP_F32_F32_F32_F32>; - -} // End SubtargetPredicate = isSICI - -let SubtargetPredicate = isVI in { - -defm V_LSHLREV_B64 : VOP3Inst , "v_lshlrev_b64", - VOP_I64_I32_I64 ->; -defm V_LSHRREV_B64 : VOP3Inst , "v_lshrrev_b64", - VOP_I64_I32_I64 ->; -defm V_ASHRREV_I64 : VOP3Inst , "v_ashrrev_i64", - VOP_I64_I32_I64 ->; - -} // End SubtargetPredicate = isVI - -//===----------------------------------------------------------------------===// -// Pseudo Instructions -//===----------------------------------------------------------------------===// -let isCodeGenOnly = 1, isPseudo = 1 in { - -// For use in patterns -def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), - (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] ->; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { -// 64-bit vector move instruction. This is mainly used by the SIFoldOperands -// pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; -} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 - -let hasSideEffects = 1 in { -def SGPR_USE : InstSI <(outs),(ins), "", []>; -} - -// SI pseudo instructions. These are used by the CFG structurizer pass -// and should be lowered to ISA instructions prior to codegen. 
-
-let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
-let Uses = [EXEC], Defs = [EXEC] in {
-
-let isBranch = 1, isTerminator = 1 in {
-
-def SI_IF: InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$vcc, brtarget:$target),
- "",
- [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))]
->;
-
-def SI_ELSE : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src, brtarget:$target),
- "",
- [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]
-> {
- let Constraints = "$src = $dst";
-}
-
-def SI_LOOP : InstSI <
- (outs),
- (ins SReg_64:$saved, brtarget:$target),
- "si_loop $saved, $target",
- [(int_SI_loop i64:$saved, bb:$target)]
->;
-
-} // end isBranch = 1, isTerminator = 1
-
-def SI_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src),
- "si_break $dst, $src",
- [(set i64:$dst, (int_SI_break i64:$src))]
->;
-
-def SI_IF_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$vcc, SReg_64:$src),
- "si_if_break $dst, $vcc, $src",
- [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))]
->;
-
-def SI_ELSE_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src0, SReg_64:$src1),
- "si_else_break $dst, $src0, $src1",
- [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))]
->;
-
-def SI_END_CF : InstSI <
- (outs),
- (ins SReg_64:$saved),
- "si_end_cf $saved",
- [(int_SI_end_cf i64:$saved)]
->;
-
-} // End Uses = [EXEC], Defs = [EXEC]
-
-let Uses = [EXEC], Defs = [EXEC,VCC] in {
-def SI_KILL : InstSI <
- (outs),
- (ins VSrc_32:$src),
- "si_kill $src",
- [(int_AMDGPU_kill f32:$src)]
->;
-} // End Uses = [EXEC], Defs = [EXEC,VCC]
-
-} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
-
-let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
-
-//defm SI_ : RegisterLoadStore ;
-
-let UseNamedOperandTable = 1 in {
-
-def SI_RegisterLoad : InstSI <
- (outs VGPR_32:$dst, SReg_64:$temp),
- (ins FRAMEri32:$addr, i32imm:$chan),
- "", []
-> {
- let isRegisterLoad = 1;
- let mayLoad = 1;
-}
-
-class SIRegStore : InstSI <
- outs,
- (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan),
- "", []
-> {
- let isRegisterStore = 1;
- let mayStore = 1;
-}
-
-let usesCustomInserter = 1 in {
-def SI_RegisterStorePseudo : SIRegStore<(outs)>;
-} // End usesCustomInserter = 1
-def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
-
-
-} // End UseNamedOperandTable = 1
-
-def SI_INDIRECT_SRC : InstSI <
- (outs VGPR_32:$dst, SReg_64:$temp),
- (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
- "si_indirect_src $dst, $temp, $src, $idx, $off",
- []
->;
-
-class SI_INDIRECT_DST : InstSI <
- (outs rc:$dst, SReg_64:$temp),
- (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val),
- "si_indirect_dst $dst, $temp, $src, $idx, $off, $val",
- []
-> {
- let Constraints = "$src = $dst";
-}
-
-def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST;
-def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST;
-def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST;
-def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST;
-def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST;
-
-} // End Uses = [EXEC], Defs = [EXEC,VCC,M0]
-
-multiclass SI_SPILL_SGPR {
-
- let UseNamedOperandTable = 1 in {
- def _SAVE : InstSI <
- (outs),
- (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
- SReg_32:$scratch_offset),
- "", []
- >;
-
- def _RESTORE : InstSI <
- (outs sgpr_class:$dst),
- (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
- "", []
- >;
- } // End UseNamedOperandTable = 1
-}
-
-// It's unclear whether you can use M0 as the output of v_readlane_b32
-// instructions, so use SGPR_32 register class for spills to prevent
-// this from happening.
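For background on the SI_SPILL pseudos defined above and instantiated just below: SGPR save/restore on this target is commonly implemented by parking the scalar value in one lane of a VGPR with v_writelane_b32 and pulling it back with v_readlane_b32, which is where the M0 concern comes from. A minimal model (the names and the 64-lane assumption are illustrative):

#include <cstdint>

struct VGPR { uint32_t lane[64]; };  // one 32-bit value per lane

// v_writelane_b32: store an SGPR value into a single lane of a VGPR.
void spillSGPR(VGPR &slot, unsigned lane, uint32_t sgprVal) {
  slot.lane[lane] = sgprVal;
}

// v_readlane_b32: read one lane of a VGPR back into an SGPR.
uint32_t restoreSGPR(const VGPR &slot, unsigned lane) {
  return slot.lane[lane];
}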
-defm SI_SPILL_S32 : SI_SPILL_SGPR ; -defm SI_SPILL_S64 : SI_SPILL_SGPR ; -defm SI_SPILL_S128 : SI_SPILL_SGPR ; -defm SI_SPILL_S256 : SI_SPILL_SGPR ; -defm SI_SPILL_S512 : SI_SPILL_SGPR ; - -multiclass SI_SPILL_VGPR { - let UseNamedOperandTable = 1, VGPRSpill = 1 in { - def _SAVE : InstSI < - (outs), - (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), - "", [] - >; - - def _RESTORE : InstSI < - (outs vgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), - "", [] - >; - } // End UseNamedOperandTable = 1, VGPRSpill = 1 -} - -defm SI_SPILL_V32 : SI_SPILL_VGPR ; -defm SI_SPILL_V64 : SI_SPILL_VGPR ; -defm SI_SPILL_V96 : SI_SPILL_VGPR ; -defm SI_SPILL_V128 : SI_SPILL_VGPR ; -defm SI_SPILL_V256 : SI_SPILL_VGPR ; -defm SI_SPILL_V512 : SI_SPILL_VGPR ; - -let Defs = [SCC] in { - -def SI_CONSTDATA_PTR : InstSI < - (outs SReg_64:$dst), - (ins), - "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] ->; - -} // End Defs = [SCC] - -} // end IsCodeGenOnly, isPseudo - -} // end SubtargetPredicate = isGCN - -let Predicates = [isGCN] in { - -def : Pat< - (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), - (V_CNDMASK_B32_e64 $src2, $src1, - (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0, - DSTCLAMP.NONE, DSTOMOD.NONE)) ->; - -def : Pat < - (int_AMDGPU_kilp), - (SI_KILL 0xbf800000) ->; - -/* int_SI_vs_load_input */ -def : Pat< - (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) ->; - -/* int_SI_export */ -def : Pat < - (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, - f32:$src0, f32:$src1, f32:$src2, f32:$src3), - (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, - $src0, $src1, $src2, $src3) ->; - -//===----------------------------------------------------------------------===// -// SMRD Patterns -//===----------------------------------------------------------------------===// - -multiclass SMRD_Pattern { - - // 1. SI-CI: Offset as 8bit DWORD immediate - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), - (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) - >; - - // 2. Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) - >; - - // 3. No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -multiclass SMRD_Pattern_vi { - - // 1. VI: Offset as 20bit immediate in bytes - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), - (vt (Instr_IMM $sbase, (as_i32imm $offset))) - >; - - // 2. Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) - >; - - // 3. No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -let Predicates = [isSICI] in { -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -} // End Predicates = [isSICI] - -let Predicates = [isVI] in { -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -} // End Predicates = [isVI] - -let Predicates = [isSICI] in { - -// 1. 
Offset as 8bit DWORD immediate -def : Pat < - (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) ->; - -} // End Predicates = [isSICI] - -// 2. Offset loaded in an 32bit SGPR -def : Pat < - (SIload_constant v4i32:$sbase, imm:$offset), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) ->; - -//===----------------------------------------------------------------------===// -// SOP1 Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (i64 (ctpop i64:$src)), - (i64 (REG_SEQUENCE SReg_64, - (S_BCNT1_I32_B64 $src), sub0, - (S_MOV_B32 0), sub1)) ->; - -//===----------------------------------------------------------------------===// -// SOP2 Patterns -//===----------------------------------------------------------------------===// - -// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector -// case, the sgpr-copies pass will fix this to use the vector version. -def : Pat < - (i32 (addc i32:$src0, i32:$src1)), - (S_ADD_U32 $src0, $src1) ->; - -//===----------------------------------------------------------------------===// -// SOPP Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (int_AMDGPU_barrier_global), - (S_BARRIER) ->; - -//===----------------------------------------------------------------------===// -// VOP1 Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [UnsafeFPMath] in { - -//def : RcpPat; -//defm : RsqPat; -//defm : RsqPat; - -def : RsqPat; -def : RsqPat; -} - -//===----------------------------------------------------------------------===// -// VOP2 Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), - (V_BCNT_U32_B32_e64 $popcnt, $val) ->; - -def : Pat < - (i32 (select i1:$src0, i32:$src1, i32:$src2)), - (V_CNDMASK_B32_e64 $src2, $src1, $src0) ->; - -/********** ======================= **********/ -/********** Image sampling patterns **********/ -/********** ======================= **********/ - -// Image + sampler -class SampleRawPattern : Pat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), - $addr, $rsrc, $sampler) ->; - -multiclass SampleRawPatterns { - def : SampleRawPattern(opcode # _V4_V1), i32>; - def : SampleRawPattern(opcode # _V4_V2), v2i32>; - def : SampleRawPattern(opcode # _V4_V4), v4i32>; - def : SampleRawPattern(opcode # _V4_V8), v8i32>; - def : SampleRawPattern(opcode # _V4_V16), v16i32>; -} - -// Image only -class ImagePattern : Pat < - (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), - $addr, $rsrc) ->; - -multiclass ImagePatterns { - def : ImagePattern(opcode # _V4_V1), i32>; - def : ImagePattern(opcode # _V4_V2), v2i32>; - def : ImagePattern(opcode # _V4_V4), v4i32>; -} - -// Basic sample -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; 
-defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Sample with comparison -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Sample with offsets -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Sample with comparison and offsets -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Gather opcodes -// Only the variants which make sense are defined. -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : ImagePattern; -defm : ImagePatterns; -defm : ImagePatterns; - -/* SIsample for simple 1D texture lookup */ -def : Pat < - (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), - (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SamplePattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleRectPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), - (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleShadowPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleShadowArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -/* SIsample* for texture lookups consuming more address parameters */ -multiclass SamplePatterns { - def : SamplePattern ; - def : SampleRectPattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : 
SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; -} - -defm : SamplePatterns; -defm : SamplePatterns; -defm : SamplePatterns; -defm : SamplePatterns; - -/* int_SI_imageload for texture fetches consuming varying address parameters */ -class ImageLoadPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, imm), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadMSAAPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayMSAAPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - -multiclass ImageLoadPatterns { - def : ImageLoadPattern ; - def : ImageLoadArrayPattern ; -} - -multiclass ImageLoadMSAAPatterns { - def : ImageLoadMSAAPattern ; - def : ImageLoadArrayMSAAPattern ; -} - -defm : ImageLoadPatterns; -defm : ImageLoadPatterns; - -defm : ImageLoadMSAAPatterns; -defm : ImageLoadMSAAPatterns; - -/* Image resource information */ -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -/********** ============================================ **********/ -/********** Extraction, Insertion, Building and Casting **********/ -/********** ============================================ **********/ - -foreach Index = 0-2 in { - def Extract_Element_v2i32_#Index : Extract_Element < - i32, v2i32, Index, !cast(sub#Index) - >; - def Insert_Element_v2i32_#Index : Insert_Element < - i32, v2i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v2f32_#Index : Extract_Element < - f32, v2f32, Index, !cast(sub#Index) - >; - def Insert_Element_v2f32_#Index : Insert_Element < - f32, v2f32, Index, !cast(sub#Index) - >; -} - -foreach Index = 0-3 in { - def Extract_Element_v4i32_#Index : Extract_Element < - i32, v4i32, Index, !cast(sub#Index) - >; - def Insert_Element_v4i32_#Index : Insert_Element < - i32, v4i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v4f32_#Index : Extract_Element < - f32, v4f32, Index, !cast(sub#Index) - >; - def Insert_Element_v4f32_#Index : Insert_Element < - f32, v4f32, Index, !cast(sub#Index) - >; -} - -foreach Index = 0-7 in { - def Extract_Element_v8i32_#Index : Extract_Element < - i32, v8i32, Index, !cast(sub#Index) - >; - def Insert_Element_v8i32_#Index : Insert_Element < - i32, v8i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v8f32_#Index : Extract_Element < - f32, v8f32, Index, !cast(sub#Index) - >; - def Insert_Element_v8f32_#Index : Insert_Element < - f32, v8f32, Index, !cast(sub#Index) - >; -} - -foreach Index = 0-15 in { - def Extract_Element_v16i32_#Index : Extract_Element < - i32, v16i32, Index, !cast(sub#Index) - >; - def Insert_Element_v16i32_#Index : Insert_Element < - i32, v16i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v16f32_#Index : Extract_Element < - f32, v16f32, Index, 
!cast(sub#Index)
- >;
- def Insert_Element_v16f32_#Index : Insert_Element <
- f32, v16f32, Index, !cast(sub#Index)
- >;
-}
-
-def : BitConvert ;
-def : BitConvert ;
-
-def : BitConvert ;
-def : BitConvert ;
-
-def : BitConvert ;
-
-def : BitConvert ;
-
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-def : BitConvert ;
-
-def : BitConvert ;
-def : BitConvert ;
-
-/********** =================== **********/
-/********** Src & Dst modifiers **********/
-/********** =================== **********/
-
-def : Pat <
- (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
- (f32 FP_ZERO), (f32 FP_ONE)),
- (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod)
->;
-
-/********** ================================ **********/
-/********** Floating point absolute/negative **********/
-/********** ================================ **********/
-
-// Prevent expanding both fneg and fabs.
-
-// FIXME: Should use S_OR_B32
-def : Pat <
- (fneg (fabs f32:$src)),
- (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
->;
-
-// FIXME: Should use S_OR_B32
-def : Pat <
- (fneg (fabs f64:$src)),
- (REG_SEQUENCE VReg_64,
- (i32 (EXTRACT_SUBREG f64:$src, sub0)),
- sub0,
- (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
- (V_MOV_B32_e32 0x80000000)), // Set sign bit.
- sub1)
->;
-
-def : Pat <
- (fabs f32:$src),
- (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff))
->;
-
-def : Pat <
- (fneg f32:$src),
- (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000))
->;
-
-def : Pat <
- (fabs f64:$src),
- (REG_SEQUENCE VReg_64,
- (i32 (EXTRACT_SUBREG f64:$src, sub0)),
- sub0,
- (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
- (V_MOV_B32_e32 0x7fffffff)), // Clear sign bit.
- sub1)
->;
-
-def : Pat <
- (fneg f64:$src),
- (REG_SEQUENCE VReg_64,
- (i32 (EXTRACT_SUBREG f64:$src, sub0)),
- sub0,
- (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
- (V_MOV_B32_e32 0x80000000)),
- sub1)
->;
-
-/********** ================== **********/
-/********** Immediate Patterns **********/
-/********** ================== **********/
-
-def : Pat <
- (SGPRImm<(i32 imm)>:$imm),
- (S_MOV_B32 imm:$imm)
->;
-
-def : Pat <
- (SGPRImm<(f32 fpimm)>:$imm),
- (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
->;
-
-def : Pat <
- (i32 imm:$imm),
- (V_MOV_B32_e32 imm:$imm)
->;
-
-def : Pat <
- (f32 fpimm:$imm),
- (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
->;
-
-def : Pat <
- (i64 InlineImm:$imm),
- (S_MOV_B64 InlineImm:$imm)
->;
-
-// XXX - Should this use a s_cmp to set SCC?
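A sketch of what the InlineImm predicate above is checking, assuming the usual SI encoding rules: small integers and a handful of floating-point constants can be encoded directly in the instruction word, so they need no literal dword. The helper names are mine:

#include <cstdint>

// Integer inline constants cover -16..64 inclusive.
bool isInlineImm(int64_t v) {
  return v >= -16 && v <= 64;
}

// The floating-point inline constants are 0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0.
bool isInlineFPImm(double d) {
  return d == 0.0 || d == 0.5 || d == -0.5 || d == 1.0 || d == -1.0 ||
         d == 2.0 || d == -2.0 || d == 4.0 || d == -4.0;
}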
- -// Set to sign-extended 64-bit value (true = -1, false = 0) -def : Pat < - (i1 imm:$imm), - (S_MOV_B64 (i64 (as_i64imm $imm))) ->; - -def : Pat < - (f64 InlineFPImm:$imm), - (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm:$imm))) ->; - -/********** ================== **********/ -/********** Intrinsic Patterns **********/ -/********** ================== **********/ - -/* llvm.AMDGPU.pow */ -def : POW_Common ; - -def : Pat < - (int_AMDGPU_div f32:$src0, f32:$src1), - (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) ->; - -def : Pat < - (int_AMDGPU_cube v4f32:$src), - (REG_SEQUENCE VReg_128, - (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub0, - (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub1, - (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub2, - (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub3) ->; - -def : Pat < - (i32 (sext i1:$src0)), - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) ->; - -class Ext32Pat : Pat < - (i32 (ext i1:$src0)), - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0) ->; - -def : Ext32Pat ; -def : Ext32Pat ; - -// Offset in an 32Bit VGPR -def : Pat < - (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) ->; - -// The multiplication scales from [0,1] to the unsigned integer range -def : Pat < - (AMDGPUurecip i32:$src0), - (V_CVT_U32_F32_e32 - (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, - (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) ->; - -def : Pat < - (int_SI_tid), - (V_MBCNT_HI_U32_B32_e64 0xffffffff, - (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) ->; - -//===----------------------------------------------------------------------===// -// VOP3 Patterns -//===----------------------------------------------------------------------===// - -def : IMad24Pat; -def : UMad24Pat; - -def : Pat < - (mulhu i32:$src0, i32:$src1), - (V_MUL_HI_U32 $src0, $src1) ->; - -def : Pat < - (mulhs i32:$src0, i32:$src1), - (V_MUL_HI_I32 $src0, $src1) ->; - -defm : BFIPatterns ; -def : ROTRPattern ; - -/********** ======================= **********/ -/********** Load/Store Patterns **********/ -/********** ======================= **********/ - -class DSReadPat : Pat < - (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst $ptr, (as_i16imm $offset), (i1 0)) ->; - -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; - -let AddedComplexity = 100 in { - -def : DSReadPat ; - -} // End AddedComplexity = 100 - -def : Pat < - (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1))), - (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) ->; - -class DSWritePat : Pat < - (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) ->; - -def : DSWritePat ; -def : DSWritePat ; -def : DSWritePat ; - -let AddedComplexity = 100 in { - -def : DSWritePat ; -} // End AddedComplexity = 100 - -def : Pat < 
- (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1)), - (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), - (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, - (i1 0)) ->; - -class DSAtomicRetPat : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) ->; - -// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec -// -// We need to use something for the data0, so we set a register to -// -1. For the non-rtn variants, the manual says it does -// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max -// will always do the increment so I'm assuming it's the same. -// -// We also load this -1 with s_mov_b32 / s_mov_b64 even though this -// needs to be a VGPR. The SGPR copy pass will fix this, and it's -// easier since there is no v_mov_b64. -class DSAtomicIncRetPat : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), - (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0)) ->; - - -class DSAtomicCmpXChg : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) ->; - - -// 32-bit atomics. -def : DSAtomicIncRetPat; -def : DSAtomicIncRetPat; - -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; - -def : DSAtomicCmpXChg; - -// 64-bit atomics. -def : DSAtomicIncRetPat; -def : DSAtomicIncRetPat; - -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; - -def : DSAtomicCmpXChg; - - -//===----------------------------------------------------------------------===// -// MUBUF Patterns -//===----------------------------------------------------------------------===// - -multiclass MUBUFLoad_Pattern { - def : Pat < - (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) - >; -} - -let Predicates = [isSICI] in { -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -} // End Predicates = [isSICI] - -class MUBUFScratchLoadPat : Pat < - (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))), - (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; - -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; - -// BUFFER_LOAD_DWORD*, addr64=0 -multiclass MUBUF_Load_Dword { - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, - imm:$offset, 0, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 1, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword 
v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 0, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, - imm:$offset, 1, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; -} - -defm : MUBUF_Load_Dword ; -defm : MUBUF_Load_Dword ; -defm : MUBUF_Load_Dword ; - -class MUBUFScratchStorePat : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, - u16imm:$offset)), - (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; - -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; - -/* -class MUBUFStore_Pattern : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), - (Instr $value, $srsrc, $vaddr, $offset) ->; - -let Predicates = [isSICI] in { -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -} // End Predicates = [isSICI] - -*/ - -//===----------------------------------------------------------------------===// -// MTBUF Patterns -//===----------------------------------------------------------------------===// - -// TBUFFER_STORE_FORMAT_*, addr64=0 -class MTBUF_StoreResource : Pat< - (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, - i32:$soffset, imm:$inst_offset, imm:$dfmt, - imm:$nfmt, imm:$offen, imm:$idxen, - imm:$glc, imm:$slc, imm:$tfe), - (opcode - $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), - (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, - (as_i1imm $slc), (as_i1imm $tfe), $soffset) ->; - -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; - -let SubtargetPredicate = isCI in { - -defm V_QSAD_PK_U16_U8 : VOP3Inst , "v_qsad_pk_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U16_U8 : VOP3Inst , "v_mqsad_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U32_U8 : VOP3Inst , "v_mqsad_u32_u8", - VOP_I32_I32_I32 ->; - -let isCommutable = 1 in { -defm V_MAD_U64_U32 : VOP3Inst , "v_mad_u64_u32", - VOP_I64_I32_I32_I64 ->; - -// XXX - Does this set VCC? 
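v_mad_u64_u32 above (and its signed sibling just below) is a 32x32->64-bit multiply-add; whether it also produces a carry into VCC is the open question in the comment, and is left out of this model. A minimal reference model of the arithmetic (names are mine):

#include <cstdint>

// D = S0 * S1 + S2, with a full 64-bit product and a 64-bit accumulator.
uint64_t mad_u64_u32(uint32_t s0, uint32_t s1, uint64_t s2) {
  return (uint64_t)s0 * s1 + s2;
}

int64_t mad_i64_i32(int32_t s0, int32_t s1, int64_t s2) {
  return (int64_t)s0 * s1 + s2;
}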
-defm V_MAD_I64_I32 : VOP3Inst , "v_mad_i64_i32",
- VOP_I64_I32_I32_I64
->;
-} // End isCommutable = 1
-
-// Remaining instructions:
-// FLAT_*
-// S_CBRANCH_CDBGUSER
-// S_CBRANCH_CDBGSYS
-// S_CBRANCH_CDBGSYS_OR_USER
-// S_CBRANCH_CDBGSYS_AND_USER
-// S_DCACHE_INV_VOL
-// DS_NOP
-// DS_GWS_SEMA_RELEASE_ALL
-// DS_WRAP_RTN_B32
-// DS_CNDXCHG32_RTN_B64
-// DS_WRITE_B96
-// DS_WRITE_B128
-// DS_CONDXCHG32_RTN_B128
-// DS_READ_B96
-// DS_READ_B128
-// BUFFER_LOAD_DWORDX3
-// BUFFER_STORE_DWORDX3
-
-} // End isCI
-
-//===----------------------------------------------------------------------===//
-// Flat Patterns
-//===----------------------------------------------------------------------===//
-
-class FLATLoad_Pattern :
- Pat <(vt (flat_ld i64:$ptr)),
- (Instr_ADDR64 $ptr)
->;
-
-def : FLATLoad_Pattern ;
-def : FLATLoad_Pattern ;
-def : FLATLoad_Pattern ;
-def : FLATLoad_Pattern ;
-def : FLATLoad_Pattern ;
-def : FLATLoad_Pattern ;
-def : FLATLoad_Pattern ;
-def : FLATLoad_Pattern ;
-def : FLATLoad_Pattern ;
-
-class FLATStore_Pattern :
- Pat <(st vt:$value, i64:$ptr),
- (Instr $value, $ptr)
- >;
-
-def : FLATStore_Pattern ;
-def : FLATStore_Pattern ;
-def : FLATStore_Pattern ;
-def : FLATStore_Pattern ;
-def : FLATStore_Pattern ;
-def : FLATStore_Pattern ;
-
-/********** ====================== **********/
-/********** Indirect addressing **********/
-/********** ====================== **********/
-
-multiclass SI_INDIRECT_Pattern {
-
- // 1. Extract with offset
- def : Pat<
- (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))),
- (SI_INDIRECT_SRC $vec, $idx, imm:$off)
- >;
-
- // 2. Extract without offset
- def : Pat<
- (eltvt (vector_extract vt:$vec, i32:$idx)),
- (SI_INDIRECT_SRC $vec, $idx, 0)
- >;
-
- // 3. Insert with offset
- def : Pat<
- (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
- (IndDst $vec, $idx, imm:$off, $val)
- >;
-
- // 4.
Insert without offset
- def : Pat<
- (vector_insert vt:$vec, eltvt:$val, i32:$idx),
- (IndDst $vec, $idx, 0, $val)
- >;
-}
-
-defm : SI_INDIRECT_Pattern ;
-defm : SI_INDIRECT_Pattern ;
-defm : SI_INDIRECT_Pattern ;
-defm : SI_INDIRECT_Pattern ;
-
-defm : SI_INDIRECT_Pattern ;
-defm : SI_INDIRECT_Pattern ;
-defm : SI_INDIRECT_Pattern ;
-defm : SI_INDIRECT_Pattern ;
-
-//===----------------------------------------------------------------------===//
-// Conversion Patterns
-//===----------------------------------------------------------------------===//
-
-def : Pat<(i32 (sext_inreg i32:$src, i1)),
- (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
-
-// Handle sext_inreg in i64
-def : Pat <
- (i64 (sext_inreg i64:$src, i1)),
- (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16
->;
-
-def : Pat <
- (i64 (sext_inreg i64:$src, i8)),
- (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16
->;
-
-def : Pat <
- (i64 (sext_inreg i64:$src, i16)),
- (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16
->;
-
-def : Pat <
- (i64 (sext_inreg i64:$src, i32)),
- (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16
->;
-
-class ZExt_i64_i32_Pat : Pat <
- (i64 (ext i32:$src)),
- (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1)
->;
-
-class ZExt_i64_i1_Pat : Pat <
- (i64 (ext i1:$src)),
- (REG_SEQUENCE VReg_64,
- (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
- (S_MOV_B32 0), sub1)
->;
-
-
-def : ZExt_i64_i32_Pat;
-def : ZExt_i64_i32_Pat;
-def : ZExt_i64_i1_Pat;
-def : ZExt_i64_i1_Pat;
-
-def : Pat <
- (i64 (sext i32:$src)),
- (REG_SEQUENCE SReg_64, $src, sub0,
- (S_ASHR_I32 $src, 31), sub1)
->;
-
-def : Pat <
- (i64 (sext i1:$src)),
- (REG_SEQUENCE VReg_64,
- (V_CNDMASK_B32_e64 0, -1, $src), sub0,
- (V_CNDMASK_B32_e64 0, -1, $src), sub1)
->;
-
-// If we need to perform a logical operation on i1 values, we need to
-// use vector comparisons since there is only one SCC register. Vector
-// comparisons still write to a pair of SGPRs, so treat these as
-// 64-bit comparisons. When legalizing SGPR copies, instructions
-// resulting in the copies from SCC to these instructions will be
-// moved to the VALU.
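Concretely, an i1 value in this scheme is a per-lane wavefront mask held in a 64-bit SGPR pair, which is why the patterns below map i1 logic onto 64-bit scalar ops. A minimal model, assuming a 64-lane wave with one condition bit per lane:

#include <cstdint>

using LaneMask = uint64_t;

LaneMask and_i1(LaneMask a, LaneMask b) { return a & b; }  // S_AND_B64
LaneMask or_i1 (LaneMask a, LaneMask b) { return a | b; }  // S_OR_B64
LaneMask xor_i1(LaneMask a, LaneMask b) { return a ^ b; }  // S_XOR_B64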
-def : Pat < - (i1 (and i1:$src0, i1:$src1)), - (S_AND_B64 $src0, $src1) ->; - -def : Pat < - (i1 (or i1:$src0, i1:$src1)), - (S_OR_B64 $src0, $src1) ->; - -def : Pat < - (i1 (xor i1:$src0, i1:$src1)), - (S_XOR_B64 $src0, $src1) ->; - -def : Pat < - (f32 (sint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) ->; - -def : Pat < - (f32 (uint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) ->; - -def : Pat < - (f64 (sint_to_fp i1:$src)), - (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) ->; - -def : Pat < - (f64 (uint_to_fp i1:$src)), - (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) ->; - -//===----------------------------------------------------------------------===// -// Miscellaneous Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (i32 (trunc i64:$a)), - (EXTRACT_SUBREG $a, sub0) ->; - -def : Pat < - (i1 (trunc i32:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) ->; - -def : Pat < - (i1 (trunc i64:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), - (EXTRACT_SUBREG $a, sub0)), 1) ->; - -def : Pat < - (i32 (bswap i32:$a)), - (V_BFI_B32 (S_MOV_B32 0x00ff00ff), - (V_ALIGNBIT_B32 $a, $a, 24), - (V_ALIGNBIT_B32 $a, $a, 8)) ->; - -def : Pat < - (f32 (select i1:$src2, f32:$src1, f32:$src0)), - (V_CNDMASK_B32_e64 $src0, $src1, $src2) ->; - -multiclass BFMPatterns { - def : Pat < - (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), - (BFM $a, $b) - >; - - def : Pat < - (vt (add (vt (shl 1, vt:$a)), -1)), - (BFM $a, (MOV 0)) - >; -} - -defm : BFMPatterns ; -// FIXME: defm : BFMPatterns ; - -def : BFEPattern ; - -//===----------------------------------------------------------------------===// -// Fract Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [isSI] in { - -// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is -// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient -// way to implement it is using V_FRACT_F64. -// The workaround for the V_FRACT bug is: -// fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999)
-
-// Convert (x + (-floor(x))) to fract(x)
-def : Pat <
- (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
- (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_CNDMASK_B64_PSEUDO
- $x,
- (V_MIN_F64
- SRCMODS.NONE,
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
- SRCMODS.NONE,
- (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
- DSTCLAMP.NONE, DSTOMOD.NONE),
- (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/))
->;
-
-// Convert floor(x) to (x - fract(x))
-def : Pat <
- (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
- (V_ADD_F64
- $mods,
- $x,
- SRCMODS.NEG,
- (V_CNDMASK_B64_PSEUDO
- $x,
- (V_MIN_F64
- SRCMODS.NONE,
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
- SRCMODS.NONE,
- (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
- DSTCLAMP.NONE, DSTOMOD.NONE),
- (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)),
- DSTCLAMP.NONE, DSTOMOD.NONE)
->;
-
-} // End Predicates = [isSI]
-
-let Predicates = [isCI] in {
-
-// Convert (x - floor(x)) to fract(x)
-def : Pat <
- (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
- (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
- (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
->;
-
-// Convert (x + (-floor(x))) to fract(x)
-def : Pat <
- (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
- (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
->;
-
-} // End Predicates = [isCI]
-
-//============================================================================//
-// Miscellaneous Optimization Patterns
-//============================================================================//
-
-def : SHA256MaPattern ;
-
-//============================================================================//
-// Assembler aliases
-//============================================================================//
-
-def : MnemonicAlias<"v_add_u32", "v_add_i32">;
-def : MnemonicAlias<"v_sub_u32", "v_sub_i32">;
-def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">;
-
-} // End isGCN predicate
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600EmitClauseMarkers.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/R600EmitClauseMarkers.cpp (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600EmitClauseMarkers.cpp (nonexistent)
@@ -1,336 +0,0 @@
-//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Add CF_ALU. R600 ALU instructions are grouped in clauses, each of which
-/// can hold up to 128 ALU instructions; these instructions can access up to
-/// 4 prefetched lines of 16 registers from constant buffers. Such ALU
-/// clauses are initiated by CF_ALU instructions.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "R600Defines.h"
-#include "R600InstrInfo.h"
-#include "R600MachineFunctionInfo.h"
-#include "R600RegisterInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
-using namespace llvm;
-
-namespace llvm {
- void initializeR600EmitClauseMarkersPass(PassRegistry&);
-}
-
-namespace {
-
-class R600EmitClauseMarkers : public MachineFunctionPass {
-
-private:
- const R600InstrInfo *TII;
- int Address;
-
- unsigned OccupiedDwords(MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
- return 4;
- case AMDGPU::KILL:
- return 0;
- default:
- break;
- }
-
- // These will be expanded to two ALU instructions in the
- // ExpandSpecialInstructions pass.
- if (TII->isLDSRetInstr(MI->getOpcode()))
- return 2;
-
- if (TII->isVector(*MI) ||
- TII->isCubeOp(MI->getOpcode()) ||
- TII->isReductionOp(MI->getOpcode()))
- return 4;
-
- unsigned NumLiteral = 0;
- for (MachineInstr::mop_iterator It = MI->operands_begin(),
- E = MI->operands_end(); It != E; ++It) {
- MachineOperand &MO = *It;
- if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
- ++NumLiteral;
- }
- return 1 + NumLiteral;
- }
-
- bool isALU(const MachineInstr *MI) const {
- if (TII->isALUInstr(MI->getOpcode()))
- return true;
- if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()))
- return true;
- switch (MI->getOpcode()) {
- case AMDGPU::PRED_X:
- case AMDGPU::INTERP_PAIR_XY:
- case AMDGPU::INTERP_PAIR_ZW:
- case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::COPY:
- case AMDGPU::DOT_4:
- return true;
- default:
- return false;
- }
- }
-
- bool IsTrivialInst(MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- case AMDGPU::KILL:
- case AMDGPU::RETURN:
- case AMDGPU::IMPLICIT_DEF:
- return true;
- default:
- return false;
- }
- }
-
- std::pair getAccessedBankLine(unsigned Sel) const {
- // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2
- // (See also R600ISelLowering.cpp)
- // ConstIndex value is in [0, 4095];
- return std::pair(
- ((Sel >> 2) - 512) >> 12, // KC_BANK
- // Line Number of ConstIndex
- // A line contains 16 constant registers however KCX bank can lock
- // two lines at the same time; thus we want to get an even line number.
- // Line number can be retrieved with (>>4), using (>>5) <<1 generates
- // an even number.
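// Worked example (illustrative values): with kc_bank = 1 and ConstIndex = 40,
// Sel = (512 + (1 << 12) + 40) << 2, so (Sel >> 2) - 512 = 4136; the bank is
// 4136 >> 12 = 1 and the even line pair below is ((4136 & 4095) >> 5) << 1 = 2.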
- ((((Sel >> 2) - 512) & 4095) >> 5) << 1); - } - - bool SubstituteKCacheBank(MachineInstr *MI, - std::vector > &CachedConsts, - bool UpdateInstr = true) const { - std::vector > UsedKCache; - - if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) - return true; - - const SmallVectorImpl > &Consts = - TII->getSrcs(MI); - assert((TII->isALUInstr(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); - for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) - continue; - unsigned Sel = Consts[i].second; - unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; - unsigned KCacheIndex = Index * 4 + Chan; - const std::pair &BankLine = getAccessedBankLine(Sel); - if (CachedConsts.empty()) { - CachedConsts.push_back(BankLine); - UsedKCache.push_back(std::pair(0, KCacheIndex)); - continue; - } - if (CachedConsts[0] == BankLine) { - UsedKCache.push_back(std::pair(0, KCacheIndex)); - continue; - } - if (CachedConsts.size() == 1) { - CachedConsts.push_back(BankLine); - UsedKCache.push_back(std::pair(1, KCacheIndex)); - continue; - } - if (CachedConsts[1] == BankLine) { - UsedKCache.push_back(std::pair(1, KCacheIndex)); - continue; - } - return false; - } - - if (!UpdateInstr) - return true; - - for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) - continue; - switch(UsedKCache[j].first) { - case 0: - Consts[i].first->setReg( - AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); - break; - case 1: - Consts[i].first->setReg( - AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); - break; - default: - llvm_unreachable("Wrong Cache Line"); - } - j++; - } - return true; - } - - bool canClauseLocalKillFitInClause( - unsigned AluInstCount, - std::vector > KCacheBanks, - MachineBasicBlock::iterator Def, - MachineBasicBlock::iterator BBEnd) { - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - for (MachineInstr::const_mop_iterator - MOI = Def->operands_begin(), - MOE = Def->operands_end(); MOI != MOE; ++MOI) { - if (!MOI->isReg() || !MOI->isDef() || - TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) - continue; - - // Def defines a clause local register, so check that its use will fit - // in the clause. - unsigned LastUseCount = 0; - for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { - AluInstCount += OccupiedDwords(UseI); - // Make sure we won't need to end the clause due to KCache limitations. - if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) - return false; - - // We have reached the maximum instruction limit before finding the - // use that kills this register, so we cannot use this def in the - // current clause. - if (AluInstCount >= TII->getMaxAlusPerClause()) - return false; - - // Register kill flags have been cleared by the time we get to this - // pass, but it is safe to assume that all uses of this register - // occur in the same basic block as its definition, because - // it is illegal for the scheduler to schedule them in - // different blocks. 
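// Record how far into the clause the most recent use of this register
// falls; once the scan stops, LastUseCount marks the instruction that
// kills it.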
- if (UseI->findRegisterUseOperandIdx(MOI->getReg())) - LastUseCount = AluInstCount; - - if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) - break; - } - if (LastUseCount) - return LastUseCount <= TII->getMaxAlusPerClause(); - llvm_unreachable("Clause local register live at end of clause."); - } - return true; - } - - MachineBasicBlock::iterator - MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - MachineBasicBlock::iterator ClauseHead = I; - std::vector > KCacheBanks; - bool PushBeforeModifier = false; - unsigned AluInstCount = 0; - for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { - if (IsTrivialInst(I)) - continue; - if (!isALU(I)) - break; - if (AluInstCount > TII->getMaxAlusPerClause()) - break; - if (I->getOpcode() == AMDGPU::PRED_X) { - // We put PRED_X in its own clause to ensure that ifcvt won't create - // clauses with more than 128 insts. - // IfCvt is indeed checking that "then" and "else" branches of an if - // statement have less than ~60 insts thus converted clauses can't be - // bigger than ~121 insts (predicate setter needs to be in the same - // clause as predicated alus). - if (AluInstCount > 0) - break; - if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) - PushBeforeModifier = true; - AluInstCount ++; - continue; - } - // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: - // - // * KILL or INTERP instructions - // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits - // * Uses waterfalling (i.e. INDEX_MODE = AR.X) - // - // XXX: These checks have not been implemented yet. - if (TII->mustBeLastInClause(I->getOpcode())) { - I++; - break; - } - - // If this instruction defines a clause local register, make sure - // its use can fit in this clause. - if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) - break; - - if (!SubstituteKCacheBank(I, KCacheBanks)) - break; - AluInstCount += OccupiedDwords(I); - } - unsigned Opcode = PushBeforeModifier ? - AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; - BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) - // We don't use the ADDR field until R600ControlFlowFinalizer pass, where - // it is safe to assume it is 0. However if we always put 0 here, the ifcvt - // pass may assume that identical ALU clause starter at the beginning of a - // true and false branch can be factorized which is not the case. 
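// Operand order for the clause marker built below: ADDR, kcache bank 0/1,
// kcache mode 0/1 (0 when the bank is unused, 2 when it is live), kcache
// line 0/1, the ALU instruction count, and the enable bit.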
- .addImm(Address++) // ADDR
- .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
- .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
- .addImm(KCacheBanks.empty()?0:2) // KM0
- .addImm((KCacheBanks.size() < 2)?0:2) // KM1
- .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
- .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1
- .addImm(AluInstCount) // COUNT
- .addImm(1); // Enabled
- return I;
- }
-
-public:
- static char ID;
- R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
-
- initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override {
- TII = static_cast(MF.getSubtarget().getInstrInfo());
-
- for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
- BB != BB_E; ++BB) {
- MachineBasicBlock &MBB = *BB;
- MachineBasicBlock::iterator I = MBB.begin();
- if (I->getOpcode() == AMDGPU::CF_ALU)
- continue; // BB was already parsed
- for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
- if (isALU(I))
- I = MakeALUClause(MBB, I);
- else
- ++I;
- }
- }
- return false;
- }
-
- const char *getPassName() const override {
- return "R600 Emit Clause Markers Pass";
- }
-};
-
-char R600EmitClauseMarkers::ID = 0;
-
-} // end anonymous namespace
-
-INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
- "R600 Emit Clause Markers", false, false)
-INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
- "R600 Emit Clause Markers", false, false)
-
-llvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
- return new R600EmitClauseMarkers();
-}
-
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600ExpandSpecialInstrs.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/R600ExpandSpecialInstrs.cpp (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600ExpandSpecialInstrs.cpp (nonexistent)
@@ -1,349 +0,0 @@
-//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Vector, Reduction, and Cube instructions need to fill the entire instruction
-/// group to work correctly. This pass expands these individual instructions
-/// into several instructions that will completely fill the instruction group.
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -namespace { - -class R600ExpandSpecialInstrsPass : public MachineFunctionPass { - -private: - static char ID; - const R600InstrInfo *TII; - - void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI, - unsigned Op); - -public: - R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), - TII(nullptr) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "R600 Expand special instructions pass"; - } -}; - -} // End anonymous namespace - -char R600ExpandSpecialInstrsPass::ID = 0; - -FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { - return new R600ExpandSpecialInstrsPass(TM); -} - -void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, - const MachineInstr *OldMI, unsigned Op) { - int OpIdx = TII->getOperandIdx(*OldMI, Op); - if (OpIdx > -1) { - uint64_t Val = OldMI->getOperand(OpIdx).getImm(); - TII->setImmOperand(NewMI, Op, Val); - } -} - -bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - MachineBasicBlock::iterator I = MBB.begin(); - while (I != MBB.end()) { - MachineInstr &MI = *I; - I = std::next(I); - - // Expand LDS_*_RET instructions - if (TII->isLDSRetInstr(MI.getOpcode())) { - int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); - assert(DstIdx != -1); - MachineOperand &DstOp = MI.getOperand(DstIdx); - MachineInstr *Mov = TII->buildMovInstr(&MBB, I, - DstOp.getReg(), AMDGPU::OQAP); - DstOp.setReg(AMDGPU::OQAP); - int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), - AMDGPU::OpName::pred_sel); - int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), - AMDGPU::OpName::pred_sel); - // Copy the pred_sel bit - Mov->getOperand(MovPredSelIdx).setReg( - MI.getOperand(LDSPredSelIdx).getReg()); - } - - switch (MI.getOpcode()) { - default: break; - // Expand PRED_X to one of the PRED_SET instructions. - case AMDGPU::PRED_X: { - uint64_t Flags = MI.getOperand(3).getImm(); - // The native opcode used by PRED_X is stored as an immediate in the - // third operand. - MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, - MI.getOperand(2).getImm(), // opcode - MI.getOperand(0).getReg(), // dst - MI.getOperand(1).getReg(), // src0 - AMDGPU::ZERO); // src1 - TII->addFlag(PredSet, 0, MO_FLAG_MASK); - if (Flags & MO_FLAG_PUSH) { - TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1); - } else { - TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1); - } - MI.eraseFromParent(); - continue; - } - - case AMDGPU::INTERP_PAIR_XY: { - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(2).getImm()); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - unsigned DstReg; - - if (Chan < 2) - DstReg = MI.getOperand(Chan).getReg(); - else - DstReg = Chan == 2 ? 
AMDGPU::T0_Z : AMDGPU::T0_W; - - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, - DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); - - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan >= 2) - TII->addFlag(BMI, 0, MO_FLAG_MASK); - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } - - case AMDGPU::INTERP_PAIR_ZW: { - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(2).getImm()); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - unsigned DstReg; - - if (Chan < 2) - DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y; - else - DstReg = MI.getOperand(Chan-2).getReg(); - - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, - DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); - - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan < 2) - TII->addFlag(BMI, 0, MO_FLAG_MASK); - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } - - case AMDGPU::INTERP_VEC_LOAD: { - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(1).getImm()); - unsigned DstReg = MI.getOperand(0).getReg(); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, - TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } - case AMDGPU::DOT_4: { - - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - bool Mask = (Chan != TRI.getHWRegChan(DstReg)); - unsigned SubDstReg = - AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); - MachineInstr *BMI = - TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Mask) { - TII->addFlag(BMI, 0, MO_FLAG_MASK); - } - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - unsigned Opcode = BMI->getOpcode(); - // While not strictly necessary from hw point of view, we force - // all src operands of a dot4 inst to belong to the same slot. 
- unsigned Src0 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) - .getReg(); - unsigned Src1 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) - .getReg(); - (void) Src0; - (void) Src1; - if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && - (TRI.getEncodingValue(Src1) & 0xff) < 127) - assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); - } - MI.eraseFromParent(); - continue; - } - } - - bool IsReduction = TII->isReductionOp(MI.getOpcode()); - bool IsVector = TII->isVector(MI); - bool IsCube = TII->isCubeOp(MI.getOpcode()); - if (!IsReduction && !IsVector && !IsCube) { - continue; - } - - // Expand the instruction - // - // Reduction instructions: - // T0_X = DP4 T1_XYZW, T2_XYZW - // becomes: - // TO_X = DP4 T1_X, T2_X - // TO_Y (write masked) = DP4 T1_Y, T2_Y - // TO_Z (write masked) = DP4 T1_Z, T2_Z - // TO_W (write masked) = DP4 T1_W, T2_W - // - // Vector instructions: - // T0_X = MULLO_INT T1_X, T2_X - // becomes: - // T0_X = MULLO_INT T1_X, T2_X - // T0_Y (write masked) = MULLO_INT T1_X, T2_X - // T0_Z (write masked) = MULLO_INT T1_X, T2_X - // T0_W (write masked) = MULLO_INT T1_X, T2_X - // - // Cube instructions: - // T0_XYZW = CUBE T1_XYZW - // becomes: - // TO_X = CUBE T1_Z, T1_Y - // T0_Y = CUBE T1_Z, T1_X - // T0_Z = CUBE T1_X, T1_Z - // T0_W = CUBE T1_Y, T1_Z - for (unsigned Chan = 0; Chan < 4; Chan++) { - unsigned DstReg = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg(); - unsigned Src0 = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg(); - unsigned Src1 = 0; - - // Determine the correct source registers - if (!IsCube) { - int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1); - if (Src1Idx != -1) { - Src1 = MI.getOperand(Src1Idx).getReg(); - } - } - if (IsReduction) { - unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); - Src0 = TRI.getSubReg(Src0, SubRegIndex); - Src1 = TRI.getSubReg(Src1, SubRegIndex); - } else if (IsCube) { - static const int CubeSrcSwz[] = {2, 2, 0, 1}; - unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); - unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); - Src1 = TRI.getSubReg(Src0, SubRegIndex1); - Src0 = TRI.getSubReg(Src0, SubRegIndex0); - } - - // Determine the correct destination registers; - bool Mask = false; - bool NotLast = true; - if (IsCube) { - unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); - DstReg = TRI.getSubReg(DstReg, SubRegIndex); - } else { - // Mask the write if the original instruction does not write to - // the current Channel. 
- // Mask the write if the original instruction does not write to - // the current Channel. - Mask = (Chan != TRI.getHWRegChan(DstReg)); - unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; - DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); - } - - // Set the IsLast bit - NotLast = (Chan != 3); - - // Add the new instruction - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AMDGPU::CUBE_r600_pseudo: - Opcode = AMDGPU::CUBE_r600_real; - break; - case AMDGPU::CUBE_eg_pseudo: - Opcode = AMDGPU::CUBE_eg_real; - break; - default: - break; - } - - MachineInstr *NewMI = - TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); - - if (Chan != 0) - NewMI->bundleWithPred(); - if (Mask) { - TII->addFlag(NewMI, 0, MO_FLAG_MASK); - } - if (NotLast) { - TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); - } - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg); - } - MI.eraseFromParent(); - } - } - return false; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600Intrinsics.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600Intrinsics.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600Intrinsics.td (nonexistent) @@ -1,75 +0,0 @@ -//===-- R600Intrinsics.td - R600 Intrinsic defs --------*- tablegen -*-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// R600 Intrinsic Definitions -// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "R600", isTarget = 1 in { - class TextureIntrinsicFloatInput : - Intrinsic<[llvm_v4f32_ty], [ - llvm_v4f32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty // coord_type_w - ], [IntrNoMem]>; - class TextureIntrinsicInt32Input : - Intrinsic<[llvm_v4i32_ty], [ - llvm_v4i32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty // coord_type_w - ], [IntrNoMem]>; - - def int_R600_load_input : - Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_R600_interp_input : - Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_R600_interp_const : - Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_R600_interp_xy : - Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; -def int_R600_interp_zw : - Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_R600_load_texbuf : - Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_R600_tex : TextureIntrinsicFloatInput; - def int_R600_texc : TextureIntrinsicFloatInput; - def int_R600_txl : TextureIntrinsicFloatInput; - def int_R600_txlc : TextureIntrinsicFloatInput;
- def int_R600_txb : TextureIntrinsicFloatInput; - def int_R600_txbc : TextureIntrinsicFloatInput; - def int_R600_txf : TextureIntrinsicInt32Input; - def int_R600_ldptr : TextureIntrinsicInt32Input; - def int_R600_txq : TextureIntrinsicInt32Input; - def int_R600_ddx : TextureIntrinsicFloatInput; - def int_R600_ddy : TextureIntrinsicFloatInput; - def int_R600_store_swizzle : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_R600_store_stream_output : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_R600_store_pixel_depth : - Intrinsic<[], [llvm_float_ty], []>; - def int_R600_store_pixel_stencil : - Intrinsic<[], [llvm_float_ty], []>; - def int_R600_store_dummy : - Intrinsic<[], [llvm_i32_ty], []>; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.h (nonexistent) @@ -1,66 +0,0 @@ -//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// - - -#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H -#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H - -#include "AMDGPUMachineFunction.h" -#include "SIRegisterInfo.h" -#include <map> - -namespace llvm { - -class MachineRegisterInfo; - -/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which -/// tells the hardware which interpolation parameters to load.
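The class declared below records, among other things, where spilled SGPR values live. A hedged sketch of the idea behind its SpilledReg pair: every 32-bit spill slot maps to one lane of some VGPR. All names and the sequential lane allocator here are hypothetical illustrations, not the LLVM implementation.

#include <cstdio>
#include <map>
#include <utility>

struct SpilledLoc {
  unsigned VGPR = 0;
  int Lane = -1;                       // -1: no lane assigned yet
  bool hasLane() const { return Lane != -1; }
};

int main() {
  const unsigned WaveSize = 64;        // lanes per VGPR (assumption)
  std::map<std::pair<int, unsigned>, SpilledLoc> Slots; // (FrameIndex, SubIdx)
  unsigned NextVGPR = 0, NextLane = 0;

  auto assign = [&](int FI, unsigned SubIdx) -> SpilledLoc & {
    SpilledLoc &L = Slots[{FI, SubIdx}];
    if (!L.hasLane()) {                // hand out lanes sequentially
      L.VGPR = NextVGPR;
      L.Lane = static_cast<int>(NextLane++);
      if (NextLane == WaveSize) { NextLane = 0; ++NextVGPR; }
    }
    return L;
  };

  SpilledLoc &L = assign(/*FrameIndex=*/0, /*SubIdx=*/1);
  std::printf("FI 0, sub1 -> VGPR %u, lane %d\n", L.VGPR, L.Lane);
  return 0;
}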
-class SIMachineFunctionInfo : public AMDGPUMachineFunction { - void anchor() override; - - unsigned TIDReg; - bool HasSpilledVGPRs; - -public: - - struct SpilledReg { - unsigned VGPR; - int Lane; - SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { } - SpilledReg() : VGPR(0), Lane(-1) { } - bool hasLane() { return Lane != -1;} - }; - - // SIMachineFunctionInfo definition - - SIMachineFunctionInfo(const MachineFunction &MF); - SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, - unsigned SubIdx); - unsigned PSInputAddr; - unsigned NumUserSGPRs; - std::map<unsigned, unsigned> LaneVGPRs; - unsigned LDSWaveSpillSize; - unsigned ScratchOffsetReg; - bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; - unsigned getTIDReg() const { return TIDReg; }; - void setTIDReg(unsigned Reg) { TIDReg = Reg; } - bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } - void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } - - unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; -}; - -} // End namespace llvm - - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h (nonexistent) @@ -1,206 +0,0 @@ -//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Contains the definition of a TargetInstrInfo class that is common -/// to all AMD GPUs.
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H - -#include "AMDGPURegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" -#include <map> - -#define GET_INSTRINFO_HEADER -#define GET_INSTRINFO_ENUM -#define GET_INSTRINFO_OPERAND_ENUM -#include "AMDGPUGenInstrInfo.inc" - -#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT -#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT -#define OPCODE_IS_ZERO AMDGPU::PRED_SETE -#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE - -namespace llvm { - -class AMDGPUSubtarget; -class MachineFunction; -class MachineInstr; -class MachineInstrBuilder; - -class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { -private: - const AMDGPURegisterInfo RI; - virtual void anchor(); -protected: - const AMDGPUSubtarget &ST; -public: - explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); - - virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; - - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; - - unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const override; - bool hasLoadFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const override; - unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; - unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const; - bool hasStoreFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const; - - MachineInstr * - convertToThreeAddress(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const override; - - - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - - void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - -protected: - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - int FrameIndex) const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - MachineInstr *LoadMI) const override; - -public: - /// \returns the smallest register index that will be accessed by an indirect - /// read or write or -1 if indirect addressing is not used by this program. - int getIndirectIndexBegin(const MachineFunction &MF) const; - - /// \returns the largest register index that will be accessed by an indirect - /// read or write or -1 if indirect addressing is not used by this program.
- int getIndirectIndexEnd(const MachineFunction &MF) const; - - bool canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const override; - bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, bool UnfoldStore, - SmallVectorImpl<MachineInstr *> &NewMIs) const override; - bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl<SDNode *> &NewNodes) const override; - unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = nullptr) const override; - - bool enableClusterLoads() const override; - - bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, - int64_t Offset1, int64_t Offset2, - unsigned NumLoads) const override; - - bool - ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; - void insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const override; - bool isPredicated(const MachineInstr *MI) const override; - bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, - const SmallVectorImpl<MachineOperand> &Pred2) const override; - bool DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const override; - bool isPredicable(MachineInstr *MI) const override; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - - // Helper functions that check the opcode for status information - bool isRegisterStore(const MachineInstr &MI) const; - bool isRegisterLoad(const MachineInstr &MI) const; - - /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. - /// Return -1 if the target-specific opcode for the pseudo instruction does - /// not exist. If Opcode is not a pseudo instruction, this is identity. - int pseudoToMCOpcode(int Opcode) const; - - /// \brief Return the descriptor of the target-specific machine instruction - /// that corresponds to the specified pseudo or native opcode. - const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { - return get(pseudoToMCOpcode(Opcode)); - } - -//===---------------------------------------------------------------------===// -// Pure virtual functions to be implemented by sub-classes. -//===---------------------------------------------------------------------===// - - virtual bool isMov(unsigned opcode) const = 0; - - /// \brief Calculate the "Indirect Address" for the given \p RegIndex and - /// \p Channel - /// - /// We model indirect addressing using a virtual address space that can be - /// accessed with loads and stores. The "Indirect Address" is the memory - /// address in this virtual address space that maps to the given \p RegIndex - /// and \p Channel. - virtual unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const = 0; - - /// \returns The register class to be used for loading and storing values - /// from an "Indirect Address". - virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0; - - /// \brief Build instruction(s) for an indirect register write. - /// - /// \returns The instruction that performs the indirect register write - virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const = 0; - - /// \brief Build instruction(s) for an indirect register read.
- /// - /// \returns The instruction that performs the indirect register read - virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const = 0; - - /// \brief Build a MOV instruction. - virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const = 0; - - /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the - /// equivalent opcode that writes \p Channels Channels. - int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; - -}; - -namespace AMDGPU { - int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); -} // End namespace AMDGPU - -} // End llvm namespace - -#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) -#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp (nonexistent) @@ -1,1371 +0,0 @@ -//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief Defines an instruction selector for the AMDGPU target. -// -//===----------------------------------------------------------------------===// -#include "AMDGPUInstrInfo.h" -#include "AMDGPUISelLowering.h" // For AMDGPUISD -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "SIDefines.h" -#include "SIISelLowering.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/Function.h" - -using namespace llvm; - -//===----------------------------------------------------------------------===// -// Instruction Selector Implementation -//===----------------------------------------------------------------------===// - -namespace { -/// AMDGPU specific code to select AMDGPU machine instructions for -/// SelectionDAG operations. -class AMDGPUDAGToDAGISel : public SelectionDAGISel { - // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can - // make the right decision when generating code for different targets. 
- const AMDGPUSubtarget *Subtarget; -public: - AMDGPUDAGToDAGISel(TargetMachine &TM); - virtual ~AMDGPUDAGToDAGISel(); - bool runOnMachineFunction(MachineFunction &MF) override; - SDNode *Select(SDNode *N) override; - const char *getPassName() const override; - void PostprocessISelDAG() override; - -private: - bool isInlineImmediate(SDNode *N) const; - bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, - const R600InstrInfo *TII); - bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); - bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); - - // Complex pattern selectors - bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); - bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); - bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); - - static bool checkType(const Value *ptr, unsigned int addrspace); - static bool checkPrivateAddress(const MachineMemOperand *Op); - - static bool isGlobalStore(const StoreSDNode *N); - static bool isFlatStore(const StoreSDNode *N); - static bool isPrivateStore(const StoreSDNode *N); - static bool isLocalStore(const StoreSDNode *N); - static bool isRegionStore(const StoreSDNode *N); - - bool isCPLoad(const LoadSDNode *N) const; - bool isConstantLoad(const LoadSDNode *N, int cbID) const; - bool isGlobalLoad(const LoadSDNode *N) const; - bool isFlatLoad(const LoadSDNode *N) const; - bool isParamLoad(const LoadSDNode *N) const; - bool isPrivateLoad(const LoadSDNode *N) const; - bool isLocalLoad(const LoadSDNode *N) const; - bool isRegionLoad(const LoadSDNode *N) const; - - SDNode *glueCopyToM0(SDNode *N) const; - - const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; - bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); - bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, - SDValue& Offset); - bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); - bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); - bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, - unsigned OffsetBits) const; - bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; - bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, - SDValue &Offset1) const; - void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, - SDValue &SOffset, SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const; - bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, - SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE) const; - bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, SDValue &Offset, - SDValue &SLC) const; - bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, - SDValue &SOffset, SDValue &ImmOffset) const; - bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, - SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const; - bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, - SDValue &Offset, SDValue &GLC) const; - SDNode *SelectAddrSpaceCast(SDNode *N); - bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Clamp, SDValue &Omod) const; - - bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Omod) const; - bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue
&SrcMods, - SDValue &Clamp, - SDValue &Omod) const; - - SDNode *SelectADD_SUB_I64(SDNode *N); - SDNode *SelectDIV_SCALE(SDNode *N); - - SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, - uint32_t Offset, uint32_t Width); - SDNode *SelectS_BFEFromShifts(SDNode *N); - SDNode *SelectS_BFE(SDNode *N); - - // Include the pieces autogenerated from the target description. -#include "AMDGPUGenDAGISel.inc" -}; -} // end anonymous namespace - -/// \brief This pass converts a legalized DAG into an AMDGPU-specific -// DAG, ready for instruction scheduling. -FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { - return new AMDGPUDAGToDAGISel(TM); -} - -AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) - : SelectionDAGISel(TM) {} - -bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget()); - return SelectionDAGISel::runOnMachineFunction(MF); -} - -AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { -} - -bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { - const SITargetLowering *TL - = static_cast<const SITargetLowering *>(getTargetLowering()); - return TL->analyzeImmediate(N) == 0; -} - -/// \brief Determine the register class for \p OpNo -/// \returns The register class of the virtual register that will be used for -/// the given operand number \p OpNo or NULL if the register class cannot be -/// determined. -const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, - unsigned OpNo) const { - if (!N->isMachineOpcode()) - return nullptr; - - switch (N->getMachineOpcode()) { - default: { - const MCInstrDesc &Desc = - Subtarget->getInstrInfo()->get(N->getMachineOpcode()); - unsigned OpIdx = Desc.getNumDefs() + OpNo; - if (OpIdx >= Desc.getNumOperands()) - return nullptr; - int RegClass = Desc.OpInfo[OpIdx].RegClass; - if (RegClass == -1) - return nullptr; - - return Subtarget->getRegisterInfo()->getRegClass(RegClass); - } - case AMDGPU::REG_SEQUENCE: { - unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); - const TargetRegisterClass *SuperRC = - Subtarget->getRegisterInfo()->getRegClass(RCID); - - SDValue SubRegOp = N->getOperand(OpNo + 1); - unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); - return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC, - SubRegIdx); - } - } -} - -bool AMDGPUDAGToDAGISel::SelectADDRParam( - SDValue Addr, SDValue& R1, SDValue& R2) { - - if (Addr.getOpcode() == ISD::FrameIndex) { - if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { - R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } - } else if (Addr.getOpcode() == ISD::ADD) { - R1 = Addr.getOperand(0); - R2 = Addr.getOperand(1); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } - return true; -} - -bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) { - return false; - } - return SelectADDRParam(Addr, R1, R2); -} - - -bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) { - return false; - } - - if (Addr.getOpcode() == ISD::FrameIndex) { - if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { - R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); - R2 = CurDAG->getTargetConstant(0,
SDLoc(Addr), MVT::i64); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } - } else if (Addr.getOpcode() == ISD::ADD) { - R1 = Addr.getOperand(0); - R2 = Addr.getOperand(1); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } - return true; -} - -SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - !checkType(cast(N)->getMemOperand()->getValue(), - AMDGPUAS::LOCAL_ADDRESS)) - return N; - - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - // Write max value to m0 before each load operation - - SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N), - CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); - - SDValue Glue = M0.getValue(1); - - SmallVector Ops; - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { - Ops.push_back(N->getOperand(i)); - } - Ops.push_back(Glue); - CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); - - return N; -} - -SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { - unsigned int Opc = N->getOpcode(); - if (N->isMachineOpcode()) { - N->setNodeId(-1); - return nullptr; // Already selected. - } - - if (isa(N)) - N = glueCopyToM0(N); - - switch (Opc) { - default: break; - // We are selecting i64 ADD here instead of custom lower it during - // DAG legalization, so we can fold some i64 ADDs used for address - // calculation into the LOAD and STORE instructions. - case ISD::ADD: - case ISD::SUB: { - if (N->getValueType(0) != MVT::i64 || - Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - break; - - return SelectADD_SUB_I64(N); - } - case ISD::SCALAR_TO_VECTOR: - case AMDGPUISD::BUILD_VERTICAL_VECTOR: - case ISD::BUILD_VECTOR: { - unsigned RegClassID; - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); - EVT VT = N->getValueType(0); - unsigned NumVectorElts = VT.getVectorNumElements(); - EVT EltVT = VT.getVectorElementType(); - assert(EltVT.bitsEq(MVT::i32)); - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - bool UseVReg = true; - for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); - U != E; ++U) { - if (!U->isMachineOpcode()) { - continue; - } - const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); - if (!RC) { - continue; - } - if (static_cast(TRI)->isSGPRClass(RC)) { - UseVReg = false; - } - } - switch(NumVectorElts) { - case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID : - AMDGPU::SReg_32RegClassID; - break; - case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID : - AMDGPU::SReg_64RegClassID; - break; - case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID : - AMDGPU::SReg_128RegClassID; - break; - case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID : - AMDGPU::SReg_256RegClassID; - break; - case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID : - AMDGPU::SReg_512RegClassID; - break; - default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); - } - } else { - // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG - // that adds a 128 bits reg copy when going through TwoAddressInstructions - // pass. We want to avoid 128 bits copies as much as possible because they - // can't be bundled by our scheduler. 
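Both lowering paths below end in a REG_SEQUENCE node whose operand list has a fixed shape: one leading register-class id, then a (value, subregister-index) pair per element — hence the NumVectorElts * 2 + 1 sizing in the code that follows. A tiny standalone illustration of that layout (plain C++, strings standing in for SDValues):

#include <cstdio>
#include <string>
#include <vector>

int main() {
  const unsigned NumVectorElts = 4;
  std::vector<std::string> RegSeqArgs(NumVectorElts * 2 + 1);
  RegSeqArgs[0] = "RegClassID";                            // leading class id
  for (unsigned i = 0; i < NumVectorElts; ++i) {
    RegSeqArgs[1 + 2 * i] = "elt" + std::to_string(i);     // value operand
    RegSeqArgs[1 + 2 * i + 1] = "sub" + std::to_string(i); // subreg index
  }
  for (const std::string &S : RegSeqArgs)
    std::printf("%s ", S.c_str()); // RegClassID elt0 sub0 elt1 sub1 ...
  std::printf("\n");
  return 0;
}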
- switch(NumVectorElts) { - case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; - case 4: - if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) - RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; - else - RegClassID = AMDGPU::R600_Reg128RegClassID; - break; - default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); - } - } - - SDLoc DL(N); - SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); - - if (NumVectorElts == 1) { - return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, - N->getOperand(0), RegClass); - } - - assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " - "supported yet"); - // 16 = Max Num Vector Elements - // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) - // 1 = Vector Register Class - SmallVector RegSeqArgs(NumVectorElts * 2 + 1); - - RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); - bool IsRegSeq = true; - unsigned NOps = N->getNumOperands(); - for (unsigned i = 0; i < NOps; i++) { - // XXX: Why is this here? - if (isa(N->getOperand(i))) { - IsRegSeq = false; - break; - } - RegSeqArgs[1 + (2 * i)] = N->getOperand(i); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, - MVT::i32); - } - - if (NOps != NumVectorElts) { - // Fill in the missing undef elements if this was a scalar_to_vector. - assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); - - MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - DL, EltVT); - for (unsigned i = NOps; i < NumVectorElts; ++i) { - RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); - } - } - - if (!IsRegSeq) - break; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), - RegSeqArgs); - } - case ISD::BUILD_PAIR: { - SDValue RC, SubReg0, SubReg1; - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - break; - } - SDLoc DL(N); - if (N->getValueType(0) == MVT::i128) { - RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); - SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); - SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); - } else if (N->getValueType(0) == MVT::i64) { - RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32); - SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - } else { - llvm_unreachable("Unhandled value type for BUILD_PAIR"); - } - const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, - N->getOperand(1), SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, - DL, N->getValueType(0), Ops); - } - - case ISD::Constant: - case ISD::ConstantFP: { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) - break; - - uint64_t Imm; - if (ConstantFPSDNode *FP = dyn_cast(N)) - Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); - else { - ConstantSDNode *C = cast(N); - Imm = C->getZExtValue(); - } - - SDLoc DL(N); - SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, - MVT::i32)); - SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(Imm >> 32, DL, MVT::i32)); - const SDValue Ops[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - 
SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, - N->getValueType(0), Ops); - } - - case ISD::LOAD: { - LoadSDNode *LD = cast<LoadSDNode>(N); - SDLoc SL(N); - EVT VT = N->getValueType(0); - - if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) { - N = glueCopyToM0(N); - break; - } - - // To simplify the TableGen patterns, we replace all i64 loads with - // v2i32 loads. Alternatively, we could promote i64 loads to v2i32 - // during DAG legalization; however, some places in the DAG legalizer - // (ExpandUnalignedLoad) assume that i64 loads are available if i64 is - // legal, so doing this promotion that early can cause problems. - - SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(), - LD->getBasePtr(), LD->getMemOperand()); - SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, - MVT::i64, NewLoad); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1)); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast); - SDNode *Load = glueCopyToM0(NewLoad.getNode()); - SelectCode(Load); - N = BitCast.getNode(); - break; - } - - case ISD::STORE: { - // Handle i64 stores here for the same reason mentioned above for loads. - StoreSDNode *ST = cast<StoreSDNode>(N); - SDValue Value = ST->getValue(); - if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) { - - SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N), - MVT::v2i32, Value); - SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue, - ST->getBasePtr(), ST->getMemOperand()); - - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore); - - if (NewValue.getOpcode() == ISD::BITCAST) { - Select(NewStore.getNode()); - return SelectCode(NewValue.getNode()); - } - - // getNode() may fold the bitcast if its input was another bitcast. If that - // happens we should only select the new store. - N = NewStore.getNode(); - } - - N = glueCopyToM0(N); - break; - } - - case AMDGPUISD::REGISTER_LOAD: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - - SDLoc DL(N); - SelectADDRIndirect(N->getOperand(1), Addr, Offset); - const SDValue Ops[] = { - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, - CurDAG->getVTList(MVT::i32, MVT::i64, - MVT::Other), - Ops); - } - case AMDGPUISD::REGISTER_STORE: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - SelectADDRIndirect(N->getOperand(2), Addr, Offset); - SDLoc DL(N); - const SDValue Ops[] = { - N->getOperand(1), - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, - CurDAG->getVTList(MVT::Other), - Ops); - } - - case AMDGPUISD::BFE_I32: - case AMDGPUISD::BFE_U32: { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - break; - - // There is a scalar version available, but unlike the vector version which - // has a separate operand for the offset and width, the scalar version packs - // the width and offset into a single operand. Try to move to the scalar - // version if the offsets are constant, so that we can try to keep extended - // loads of kernel arguments in SGPRs. - - // TODO: Technically we could try to pattern match scalar bitshifts of - // dynamic values, but it's probably not useful.
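A reference model of the scalar BFE the code below targets. Per the getS_BFE comment later in this file, the packed operand carries the offset in bits [5:0] and the width in bits [22:16]; the second assert checks the shift identity SelectS_BFEFromShifts relies on, (a << b) >> c == BFE_U32(a, c - b, 32 - c) for 0 < b <= c < 32. Helper names are hypothetical and the semantics are inferred from the comments in this file, not verified against hardware.

#include <cassert>
#include <cstdint>

static uint32_t packBFE(uint32_t Offset, uint32_t Width) {
  return Offset | (Width << 16); // offset in [5:0], width in [22:16]
}

static uint32_t bfeU32(uint32_t Src, uint32_t Packed) {
  uint32_t Offset = Packed & 0x3f;
  uint32_t Width = (Packed >> 16) & 0x7f;
  if (Width == 0)
    return 0;
  if (Width > 31)
    return Src >> Offset; // whole remaining field; avoids UB in the shift below
  return (Src >> Offset) & ((1u << Width) - 1);
}

int main() {
  // Extract 8 bits starting at bit 4: (0xABCD >> 4) & 0xff == 0xBC.
  assert(bfeU32(0xABCD, packBFE(4, 8)) == 0xBC);
  // Shift-pair identity used by SelectS_BFEFromShifts (b = 4, c = 12).
  uint32_t a = 0xDEADBEEF, b = 4, c = 12;
  assert(((a << b) >> c) == bfeU32(a, packBFE(c - b, 32 - c)));
  return 0;
}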
- ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); - if (!Offset) - break; - - ConstantSDNode *Width = dyn_cast(N->getOperand(2)); - if (!Width) - break; - - bool Signed = Opc == AMDGPUISD::BFE_I32; - - uint32_t OffsetVal = Offset->getZExtValue(); - uint32_t WidthVal = Width->getZExtValue(); - - return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), - N->getOperand(0), OffsetVal, WidthVal); - - } - case AMDGPUISD::DIV_SCALE: { - return SelectDIV_SCALE(N); - } - case ISD::CopyToReg: { - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - Lowering.legalizeTargetIndependentNode(N, *CurDAG); - break; - } - case ISD::ADDRSPACECAST: - return SelectAddrSpaceCast(N); - case ISD::AND: - case ISD::SRL: - case ISD::SRA: - if (N->getValueType(0) != MVT::i32 || - Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - break; - - return SelectS_BFE(N); - } - - return SelectCode(N); -} - - -bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { - assert(AS != 0 && "Use checkPrivateAddress instead."); - if (!Ptr) - return false; - - return Ptr->getType()->getPointerAddressSpace() == AS; -} - -bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { - if (Op->getPseudoValue()) - return true; - - if (PointerType *PT = dyn_cast(Op->getValue()->getType())) - return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; - - return false; -} - -bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { - const Value *MemVal = N->getMemOperand()->getValue(); - return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); -} - -bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { - const Value *MemVal = N->getMemOperand()->getValue(); - if (CbId == -1) - return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); - - return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); -} - -bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { - if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - N->getMemoryVT().bitsLT(MVT::i32)) - return true; - - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isCPLoad(const 
LoadSDNode *N) const { - MachineMemOperand *MMO = N->getMemOperand(); - if (checkPrivateAddress(N->getMemOperand())) { - if (MMO) { - const PseudoSourceValue *PSV = MMO->getPseudoValue(); - if (PSV && PSV == PseudoSourceValue::getConstantPool()) { - return true; - } - } - } - return false; -} - -bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { - if (checkPrivateAddress(N->getMemOperand())) { - // Check to make sure we are not a constant pool load or a constant load - // that is marked as a private load - if (isCPLoad(N) || isConstantLoad(N, -1)) { - return false; - } - } - - const Value *MemVal = N->getMemOperand()->getValue(); - if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) && - !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && - !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) { - return true; - } - return false; -} - -const char *AMDGPUDAGToDAGISel::getPassName() const { - return "AMDGPU DAG->DAG Pattern Instruction Selection"; -} - -#ifdef DEBUGTMP -#undef INT64_C -#endif -#undef DEBUGTMP - -//===----------------------------------------------------------------------===// -// Complex Patterns -//===----------------------------------------------------------------------===// - -bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, - SDValue& IntPtr) { - if (ConstantSDNode *Cst = dyn_cast(Addr)) { - IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), - true); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, - SDValue& BaseReg, SDValue &Offset) { - if (!isa(Addr)) { - BaseReg = Addr; - Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, - SDValue &Offset) { - ConstantSDNode *IMMOffset; - - if (Addr.getOpcode() == ISD::ADD - && (IMMOffset = dyn_cast(Addr.getOperand(1))) - && isInt<16>(IMMOffset->getZExtValue())) { - - Base = Addr.getOperand(0); - Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), - MVT::i32); - return true; - // If the pointer address is constant, we can move it to the offset field. 
- } else if ((IMMOffset = dyn_cast(Addr)) - && isInt<16>(IMMOffset->getZExtValue())) { - Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), - SDLoc(CurDAG->getEntryNode()), - AMDGPU::ZERO, MVT::i32); - Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), - MVT::i32); - return true; - } - - // Default case, no offset - Base = Addr; - Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - return true; -} - -bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, - SDValue &Offset) { - ConstantSDNode *C; - SDLoc DL(Addr); - - if ((C = dyn_cast(Addr))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); - Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); - } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && - (C = dyn_cast(Addr.getOperand(1)))) { - Base = Addr.getOperand(0); - Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); - } else { - Base = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); - } - - return true; -} - -SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { - SDLoc DL(N); - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - bool IsAdd = (N->getOpcode() == ISD::ADD); - - SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub0); - SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub1); - - SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub0); - SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub1); - - SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); - SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; - - - unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; - unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - - SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); - SDValue Carry(AddLo, 1); - SDNode *AddHi - = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, - SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); - - SDValue Args[5] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - SDValue(AddLo,0), - Sub0, - SDValue(AddHi,0), - Sub1, - }; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); -} - -// We need to handle this here because tablegen doesn't support matching -// instructions with multiple outputs. -SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { - SDLoc SL(N); - EVT VT = N->getValueType(0); - - assert(VT == MVT::f32 || VT == MVT::f64); - - unsigned Opc - = (VT == MVT::f64) ? 
AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; - - // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod - SDValue Ops[8]; - - SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); - SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); - SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); - return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); -} - -bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, - unsigned OffsetBits) const { - if ((OffsetBits == 16 && !isUInt<16>(Offset)) || - (OffsetBits == 8 && !isUInt<8>(Offset))) - return false; - - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) - return true; - - // On Southern Islands instruction with a negative base value and an offset - // don't seem to work. - return CurDAG->SignBitIsZero(Base); -} - -bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, - SDValue &Offset) const { - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { - // (add n0, c0) - Base = N0; - Offset = N1; - return true; - } - } - - SDLoc DL(Addr); - - // If we have a constant address, prefer to put the constant into the - // offset. This can save moves to load the constant address since multiple - // operations can share the zero base address register, and enables merging - // into read2 / write2 instructions. - if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { - if (isUInt<16>(CAddr->getZExtValue())) { - SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); - MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - DL, MVT::i32, Zero); - Base = SDValue(MovZero, 0); - Offset = Addr; - return true; - } - } - - // default case - Base = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return true; -} - -bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, - SDValue &Offset0, - SDValue &Offset1) const { - SDLoc DL(Addr); - - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - unsigned DWordOffset0 = C1->getZExtValue() / 4; - unsigned DWordOffset1 = DWordOffset0 + 1; - // (add n0, c0) - if (isDSOffsetLegal(N0, DWordOffset1, 8)) { - Base = N0; - Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); - return true; - } - } - - if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { - unsigned DWordOffset0 = CAddr->getZExtValue() / 4; - unsigned DWordOffset1 = DWordOffset0 + 1; - assert(4 * DWordOffset0 == CAddr->getZExtValue()); - - if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { - SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); - MachineSDNode *MovZero - = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - DL, MVT::i32, Zero); - Base = SDValue(MovZero, 0); - Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); - return true; - } - } - - // default case - Base = Addr; - Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); - return true; -} - -static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { - return isUInt<12>(Imm->getZExtValue()); -} - -void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, - SDValue &VAddr, SDValue 
&SOffset, - SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &Addr64, - SDValue &GLC, SDValue &SLC, - SDValue &TFE) const { - SDLoc DL(Addr); - - GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); - - Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); - Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); - Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); - SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); - - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - - if (N0.getOpcode() == ISD::ADD) { - // (add (add N2, N3), C1) -> addr64 - SDValue N2 = N0.getOperand(0); - SDValue N3 = N0.getOperand(1); - Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); - Ptr = N2; - VAddr = N3; - } else { - - // (add N0, C1) -> offset - VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); - Ptr = N0; - } - - if (isLegalMUBUFImmOffset(C1)) { - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return; - } else if (isUInt<32>(C1->getZExtValue())) { - // Illegal offset, store it in soffset. - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), - 0); - return; - } - } - - if (Addr.getOpcode() == ISD::ADD) { - // (add N0, N1) -> addr64 - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); - Ptr = N0; - VAddr = N1; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return; - } - - // default case -> offset - VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); - Ptr = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE) const { - SDValue Ptr, Offen, Idxen, Addr64; - - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); - - ConstantSDNode *C = cast(Addr64); - if (C->getSExtValue()) { - SDLoc DL(Addr); - - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0); - return true; - } - - return false; -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, - SDValue &SLC) const { - SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE; - - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &ImmOffset) const { - - SDLoc DL(Addr); - MachineFunction &MF = CurDAG->getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - unsigned ScratchOffsetReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, - ScratchOffsetReg, MVT::i32); - SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); - SDValue ScratchRsrcDword0 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, 
MVT::i32, Sym0), 0); - - SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); - SDValue ScratchRsrcDword1 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); - - const SDValue RsrcOps[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - ScratchRsrcDword0, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - ScratchRsrcDword1, - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - }; - SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, RsrcOps), 0); - Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); - SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, - MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); - - // (add n0, c1) - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - - if (isLegalMUBUFImmOffset(C1)) { - VAddr = Addr.getOperand(0); - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return true; - } - } - - // (node) - VAddr = Addr; - ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return true; -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &SOffset, SDValue &Offset, - SDValue &GLC, SDValue &SLC, - SDValue &TFE) const { - SDValue Ptr, VAddr, Offen, Idxen, Addr64; - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); - - if (!cast(Offen)->getSExtValue() && - !cast(Idxen)->getSExtValue() && - !cast(Addr64)->getSExtValue()) { - uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | - APInt::getAllOnesValue(32).getZExtValue(); // Size - SDLoc DL(Addr); - - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &Soffset, SDValue &Offset, - SDValue &GLC) const { - SDValue SLC, TFE; - - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); -} - -// FIXME: This is incorrect and only enough to be able to compile. -SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { - AddrSpaceCastSDNode *ASC = cast(N); - SDLoc DL(N); - - assert(Subtarget->hasFlatAddressSpace() && - "addrspacecast only supported with flat address space!"); - - assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && - ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && - "Cannot cast address space to / from constant address!"); - - assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || - ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && - "Can only cast to / from flat address space!"); - - // The flat instructions read the address as the index of the VGPR holding the - // address, so casting should just be reinterpreting the base VGPR, so just - // insert trunc / bitcast / zext. 
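A standalone model of the two pointer-width casts built below (plain C++, not the SelectionDAG code): narrowing keeps only sub0, the low dword, and widening is a plain zero-extension — a REG_SEQUENCE whose sub1 half is a zero produced by S_MOV_B32.

#include <cassert>
#include <cstdint>

static uint32_t flatTo32(uint64_t Ptr) {    // EXTRACT_SUBREG sub0
  return static_cast<uint32_t>(Ptr);
}

static uint64_t widenToFlat(uint32_t Ptr) { // REG_SEQUENCE {zero, Ptr}
  uint64_t Hi = 0;                          // S_MOV_B32 0 -> sub1
  return (Hi << 32) | Ptr;
}

int main() {
  assert(flatTo32(0x000000001234ABCDull) == 0x1234ABCDu);
  assert(widenToFlat(0x1234ABCDu) == 0x000000001234ABCDull);
  return 0;
}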
- - SDValue Src = ASC->getOperand(0); - EVT DestVT = ASC->getValueType(0); - EVT SrcVT = Src.getValueType(); - - unsigned SrcSize = SrcVT.getSizeInBits(); - unsigned DestSize = DestVT.getSizeInBits(); - - if (SrcSize > DestSize) { - assert(SrcSize == 64 && DestSize == 32); - return CurDAG->getMachineNode( - TargetOpcode::EXTRACT_SUBREG, - DL, - DestVT, - Src, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); - } - - - if (DestSize > SrcSize) { - assert(SrcSize == 32 && DestSize == 64); - - // FIXME: This is probably wrong, we should never be defining - // a register class with both VGPRs and SGPRs - SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL, - MVT::i32); - - const SDValue Ops[] = { - RC, - Src, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(0, DL, MVT::i32)), 0), - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, - DL, N->getValueType(0), Ops); - } - - assert(SrcSize == 64 && DestSize == 64); - return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); -} - -SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, - uint32_t Offset, uint32_t Width) { - // Transformation function, pack the offset and width of a BFE into - // the format expected by the S_BFE_I32 / S_BFE_U32. In the second - // source, bits [5:0] contain the offset and bits [22:16] the width. - uint32_t PackedVal = Offset | (Width << 16); - SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32); - - return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); -} - -SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { - // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) - // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) - // Predicate: 0 < b <= c < 32 - - const SDValue &Shl = N->getOperand(0); - ConstantSDNode *B = dyn_cast(Shl->getOperand(1)); - ConstantSDNode *C = dyn_cast(N->getOperand(1)); - - if (B && C) { - uint32_t BVal = B->getZExtValue(); - uint32_t CVal = C->getZExtValue(); - - if (0 < BVal && BVal <= CVal && CVal < 32) { - bool Signed = N->getOpcode() == ISD::SRA; - unsigned Opcode = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; - - return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), - CVal - BVal, 32 - CVal); - } - } - return SelectCode(N); -} - -SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { - switch (N->getOpcode()) { - case ISD::AND: - if (N->getOperand(0).getOpcode() == ISD::SRL) { - // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" - // Predicate: isMask(mask) - const SDValue &Srl = N->getOperand(0); - ConstantSDNode *Shift = dyn_cast(Srl.getOperand(1)); - ConstantSDNode *Mask = dyn_cast(N->getOperand(1)); - - if (Shift && Mask) { - uint32_t ShiftVal = Shift->getZExtValue(); - uint32_t MaskVal = Mask->getZExtValue(); - - if (isMask_32(MaskVal)) { - uint32_t WidthVal = countPopulation(MaskVal); - - return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0), - ShiftVal, WidthVal); - } - } - } - break; - case ISD::SRL: - if (N->getOperand(0).getOpcode() == ISD::AND) { - // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" - // Predicate: isMask(mask >> b) - const SDValue &And = N->getOperand(0); - ConstantSDNode *Shift = dyn_cast(N->getOperand(1)); - ConstantSDNode *Mask = dyn_cast(And->getOperand(1)); - - if (Shift && Mask) { - uint32_t ShiftVal = Shift->getZExtValue(); - uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; - - if (isMask_32(MaskVal)) { - uint32_t WidthVal = countPopulation(MaskVal); - - return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0), - ShiftVal, WidthVal); - } - } - } else if (N->getOperand(0).getOpcode() == ISD::SHL) - return SelectS_BFEFromShifts(N); - break; - case ISD::SRA: - if (N->getOperand(0).getOpcode() == ISD::SHL) - return SelectS_BFEFromShifts(N); - break; - } - - return SelectCode(N); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { - - unsigned Mods = 0; - - Src = In; - - if (Src.getOpcode() == ISD::FNEG) { - Mods |= SISrcMods::NEG; - Src = Src.getOperand(0); - } - - if (Src.getOpcode() == ISD::FABS) { - Mods |= SISrcMods::ABS; - Src = Src.getOperand(0); - } - - SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - - return true; -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, - SDValue &SrcMods, SDValue &Clamp, - SDValue &Omod) const { - SDLoc DL(In); - // FIXME: Handle Clamp and Omod - Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); - Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); - - return SelectVOP3Mods(In, Src, SrcMods); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Omod) const { - // FIXME: Handle Omod - Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); - - return SelectVOP3Mods(In, Src, SrcMods); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Clamp, - SDValue &Omod) const { - Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); - return SelectVOP3Mods(In, Src, SrcMods); -} - -void AMDGPUDAGToDAGISel::PostprocessISelDAG() { - const AMDGPUTargetLowering& Lowering = - *static_cast(getTargetLowering()); - bool IsModified = false; - do { - IsModified = false; - // Go over all selected nodes and try to fold them a bit more - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); I != E; ++I) { - - SDNode *Node = I; - - MachineSDNode *MachineNode = dyn_cast(I); - if (!MachineNode) - continue; - - SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); - if (ResNode != Node) { - ReplaceUses(Node, 
ResNode); - IsModified = true; - } - } - CurDAG->RemoveDeadNodes(); - } while (IsModified); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp (nonexistent) @@ -1,154 +0,0 @@ -//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. -// -//===----------------------------------------------------------------------===// -// - -#include "AMDGPUMCInstLower.h" -#include "AMDGPUAsmPrinter.h" -#include "AMDGPUTargetMachine.h" -#include "InstPrinter/AMDGPUInstPrinter.h" -#include "R600InstrInfo.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include - -using namespace llvm; - -AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): - Ctx(ctx), ST(st) -{ } - -void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { - - int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); - - if (MCOpcode == -1) { - LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); - C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " - "a target-specific version: " + Twine(MI->getOpcode())); - } - - OutMI.setOpcode(MCOpcode); - - for (const MachineOperand &MO : MI->explicit_operands()) { - MCOperand MCOp; - switch (MO.getType()) { - default: - llvm_unreachable("unknown operand type"); - case MachineOperand::MO_Immediate: - MCOp = MCOperand::createImm(MO.getImm()); - break; - case MachineOperand::MO_Register: - MCOp = MCOperand::createReg(MO.getReg()); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( - MO.getMBB()->getSymbol(), Ctx)); - break; - case MachineOperand::MO_GlobalAddress: { - const GlobalValue *GV = MO.getGlobal(); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); - break; - } - case MachineOperand::MO_TargetIndex: { - assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } - case MachineOperand::MO_ExternalSymbol: { - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } - } - OutMI.addOperand(MCOp); - } -} - -void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { - const 
AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
-  AMDGPUMCInstLower MCInstLowering(OutContext, STI);
-
-#ifdef _DEBUG
-  StringRef Err;
-  if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) {
-    errs() << "Warning: Illegal instruction detected: " << Err << "\n";
-    MI->dump();
-  }
-#endif
-  if (MI->isBundle()) {
-    const MachineBasicBlock *MBB = MI->getParent();
-    MachineBasicBlock::const_instr_iterator I = MI;
-    ++I;
-    while (I != MBB->end() && I->isInsideBundle()) {
-      EmitInstruction(I);
-      ++I;
-    }
-  } else {
-    MCInst TmpInst;
-    MCInstLowering.lower(MI, TmpInst);
-    EmitToStreamer(*OutStreamer, TmpInst);
-
-    if (STI.dumpCode()) {
-      // Disassemble instruction/operands to text.
-      DisasmLines.resize(DisasmLines.size() + 1);
-      std::string &DisasmLine = DisasmLines.back();
-      raw_string_ostream DisasmStream(DisasmLine);
-
-      AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
-                                    *MF->getSubtarget().getInstrInfo(),
-                                    *MF->getSubtarget().getRegisterInfo());
-      InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(),
-                            MF->getSubtarget());
-
-      // Disassemble instruction/operands to hex representation.
-      SmallVector<MCFixup, 4> Fixups;
-      SmallVector<char, 16> CodeBytes;
-      raw_svector_ostream CodeStream(CodeBytes);
-
-      auto &ObjStreamer = static_cast<MCObjectStreamer &>(*OutStreamer);
-      MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
-      InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups,
-                                    MF->getSubtarget());
-      CodeStream.flush();
-
-      HexLines.resize(HexLines.size() + 1);
-      std::string &HexLine = HexLines.back();
-      raw_string_ostream HexStream(HexLine);
-
-      for (size_t i = 0; i < CodeBytes.size(); i += 4) {
-        unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i];
-        HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord);
-      }
-
-      DisasmStream.flush();
-      DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size());
-    }
-  }
-}
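As an aside on the dumpCode path above: the emitter collects the encoded bytes and renders them as space-separated 32-bit words. Below is a minimal standalone sketch of that byte-to-dword formatting; the helper name is illustrative, and little-endian byte order is assumed, matching the GPU encoding.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    // Consume little-endian code bytes four at a time and print each
    // dword as "%08X", mirroring the HexLines loop above.
    static std::string hexDwords(const std::vector<uint8_t> &Bytes) {
      std::string Out;
      char Buf[16];
      for (size_t I = 0; I + 3 < Bytes.size(); I += 4) {
        uint32_t DW = Bytes[I] | (Bytes[I + 1] << 8) | (Bytes[I + 2] << 16) |
                      (static_cast<uint32_t>(Bytes[I + 3]) << 24);
        std::snprintf(Buf, sizeof(Buf), "%s%08X", I > 0 ? " " : "", DW);
        Out += Buf;
      }
      return Out;
    }

    int main() {
      // An illustrative 4-byte word, not a real instruction.
      assert(hexDwords({0x01, 0x00, 0x00, 0xBF}) == "BF000001");
    }
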
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.h
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.h (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.h (nonexistent)
@@ -1,34 +0,0 @@
-//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H
-#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H
-
-#include "AMDGPUMachineFunction.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include <vector>
-
-namespace llvm {
-
-class R600MachineFunctionInfo : public AMDGPUMachineFunction {
-  void anchor() override;
-public:
-  R600MachineFunctionInfo(const MachineFunction &MF);
-  SmallVector<unsigned, 4> LiveOuts;
-  std::vector<unsigned> IndirectRegs;
-  unsigned StackSize;
-};
-
-} // End llvm namespace
-
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600RegisterInfo.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/R600RegisterInfo.td (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600RegisterInfo.td (nonexistent)
@@ -1,252 +0,0 @@
-
-class R600Reg <string name, bits<16> encoding> : Register<name> {
-  let Namespace = "AMDGPU";
-  let HWEncoding = encoding;
-}
-
-class R600RegWithChan <string name, bits<9> sel, string chan> :
-    Register <name> {
-
-  field bits<2> chan_encoding = !if(!eq(chan, "X"), 0,
-                                !if(!eq(chan, "Y"), 1,
-                                !if(!eq(chan, "Z"), 2,
-                                !if(!eq(chan, "W"), 3, 0))));
-  let HWEncoding{8-0} = sel;
-  let HWEncoding{10-9} = chan_encoding;
-  let Namespace = "AMDGPU";
-}
-
-class R600Reg_128 <string n, list<Register> subregs, bits<16> encoding> :
-    RegisterWithSubRegs<n, subregs> {
-  field bits<2> chan_encoding = 0;
-  let Namespace = "AMDGPU";
-  let SubRegIndices = [sub0, sub1, sub2, sub3];
-  let HWEncoding{8-0} = encoding{8-0};
-  let HWEncoding{10-9} = chan_encoding;
-}
-
-class R600Reg_64 <string n, list<Register> subregs, bits<16> encoding> :
-    RegisterWithSubRegs<n, subregs> {
-  field bits<2> chan_encoding = 0;
-  let Namespace = "AMDGPU";
-  let SubRegIndices = [sub0, sub1];
-  let HWEncoding = encoding;
-  let HWEncoding{8-0} = encoding{8-0};
-  let HWEncoding{10-9} = chan_encoding;
-}
-
-class R600Reg_64Vertical <int lo, int hi, string chan> : R600Reg_64 <
-  "V"#lo#hi#"_"#chan,
-  [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)],
-  lo
->;
-
-foreach Index = 0-127 in {
-  foreach Chan = [ "X", "Y", "Z", "W" ] in {
-    // 32-bit Temporary Registers
-    def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
-
-    // Indirect addressing offset registers
-    def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan,
-                                             Index, Chan>;
-  }
-  // 128-bit Temporary Registers
-  def T#Index#_XYZW : R600Reg_128 <"T"#Index#"",
-                                   [!cast<Register>("T"#Index#"_X"),
-                                    !cast<Register>("T"#Index#"_Y"),
-                                    !cast<Register>("T"#Index#"_Z"),
-                                    !cast<Register>("T"#Index#"_W")],
-                                   Index>;
-
-  def T#Index#_XY : R600Reg_64 <"T"#Index#"",
-                                [!cast<Register>("T"#Index#"_X"),
-                                 !cast<Register>("T"#Index#"_Y")],
-                                Index>;
}
-
-foreach Chan = [ "X", "Y", "Z", "W"] in {
-
-  let chan_encoding = !if(!eq(Chan, "X"), 0,
-                      !if(!eq(Chan, "Y"), 1,
-                      !if(!eq(Chan, "Z"), 2,
-                      !if(!eq(Chan, "W"), 3, 0)))) in {
-    def V0123_#Chan : R600Reg_128 <"V0123_"#Chan,
-                                   [!cast<Register>("T0_"#Chan),
-                                    !cast<Register>("T1_"#Chan),
-                                    !cast<Register>("T2_"#Chan),
-                                    !cast<Register>("T3_"#Chan)],
-                                   0>;
-    def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>;
-    def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>;
-  }
-}
-
-
-// KCACHE_BANK0
-foreach Index = 159-128 in {
-  foreach Chan = [ "X", "Y", "Z", "W" ] in {
-    // 32-bit Temporary Registers
-    def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#!add(Index,-128)#"]."#Chan, Index, Chan>;
-  }
-  // 128-bit Temporary Registers
-  def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#!add(Index, -128)#"].XYZW",
-                                      [!cast<Register>("KC0_"#Index#"_X"),
-                                       !cast<Register>("KC0_"#Index#"_Y"),
-                                       !cast<Register>("KC0_"#Index#"_Z"),
-                                       !cast<Register>("KC0_"#Index#"_W")],
-                                      Index>;
-}
-
-// KCACHE_BANK1
-foreach Index = 191-160 in {
-  foreach Chan = [ "X", "Y", "Z", "W" ] in {
-    // 32-bit Temporary Registers
-    def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#!add(Index,-160)#"]."#Chan, Index, Chan>;
-  }
-  // 128-bit Temporary Registers
-  def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#!add(Index, -160)#"].XYZW",
-                                      [!cast<Register>("KC1_"#Index#"_X"),
-                                       !cast<Register>("KC1_"#Index#"_Y"),
-                                       !cast<Register>("KC1_"#Index#"_Z"),
-                                       !cast<Register>("KC1_"#Index#"_W")],
-                                      Index>;
-}
-
-
-// Array Base Register holding input in FS
-foreach Index = 448-480 in {
-  def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>;
-}
-
-
-// Special Registers
-
-def OQA : R600Reg<"OQA", 219>;
-def OQB : R600Reg<"OQB", 220>;
-def OQAP : R600Reg<"OQAP", 221>;
-def OQBP : R600Reg<"OQBP", 222>;
-def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>;
-def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>;
-def ZERO : R600Reg<"0.0", 248>;
-def ONE : R600Reg<"1.0", 249>;
-def NEG_ONE : R600Reg<"-1.0", 249>;
-def ONE_INT : R600Reg<"1", 250>;
-def HALF : R600Reg<"0.5", 252>;
-def NEG_HALF : R600Reg<"-0.5", 252>;
-def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">;
-def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">;
-def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">;
-def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">;
-def PV_X : R600RegWithChan<"PV.X", 254, "X">;
-def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">;
-def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">;
-def PV_W : R600RegWithChan<"PV.W", 254, "W">;
-def PS: R600Reg<"PS", 255>;
-def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
-def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
-def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
-def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
-def AR_X : R600Reg<"AR.x", 0>;
-
-def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
-                          (add (sequence "ArrayBase%u", 448, 480))>;
-// special registers for ALU src operands
-// const buffer reference, SRCx_SEL contains index
-def ALU_CONST : R600Reg<"CBuf", 0>;
-// interpolation param reference, SRCx_SEL contains index
-def ALU_PARAM : R600Reg<"Param", 0>;
-
-let isAllocatable = 0 in {
-
-def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>;
-
-// We only use Addr_[YZW] for vertical vectors.
-// FIXME if we add more vertical vector registers we will need to add more
-// registers to these classes.
-def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>; -def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>; -def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>; - -def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32, - (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>; - -def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_X", 128, 159))>; - -def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_Y", 128, 159))>; - -def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_Z", 128, 159))>; - -def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_W", 128, 159))>; - -def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32, - (interleave R600_KC0_X, R600_KC0_Y, - R600_KC0_Z, R600_KC0_W)>; - -def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_X", 160, 191))>; - -def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_Y", 160, 191))>; - -def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_Z", 160, 191))>; - -def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_W", 160, 191))>; - -def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32, - (interleave R600_KC1_X, R600_KC1_Y, - R600_KC1_Z, R600_KC1_W)>; - -} // End isAllocatable = 0 - -def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_X", 0, 127), AR_X)>; - -def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_Y", 0, 127))>; - -def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_Z", 0, 127))>; - -def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_W", 0, 127))>; - -def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, - (interleave R600_TReg32_X, R600_TReg32_Y, - R600_TReg32_Z, R600_TReg32_W)>; - -def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add - R600_TReg32, - R600_ArrayBase, - R600_Addr, - R600_KC0, R600_KC1, - ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, - ALU_CONST, ALU_PARAM, OQAP - )>; - -def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add - PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; - -def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add - PREDICATE_BIT)>; - -def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, - (add (sequence "T%u_XYZW", 0, 127))> { - let CopyCost = -1; -} - -def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, - (add V0123_W, V0123_Z, V0123_Y, V0123_X) ->; - -def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, - (add (sequence "T%u_XY", 0, 63))>; - -def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, - (add V01_X, V01_Y, V01_Z, V01_W, - V23_X, V23_Y, V23_Z, V23_W)>; Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrFormats.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrFormats.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrFormats.td (nonexistent) @@ -1,671 +0,0 @@ -//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// SI Instruction format definitions. -// -//===----------------------------------------------------------------------===// - -class InstSI pattern> : - AMDGPUInst, PredicateControl { - - field bits<1> VM_CNT = 0; - field bits<1> EXP_CNT = 0; - field bits<1> LGKM_CNT = 0; - - field bits<1> SALU = 0; - field bits<1> VALU = 0; - - field bits<1> SOP1 = 0; - field bits<1> SOP2 = 0; - field bits<1> SOPC = 0; - field bits<1> SOPK = 0; - field bits<1> SOPP = 0; - - field bits<1> VOP1 = 0; - field bits<1> VOP2 = 0; - field bits<1> VOP3 = 0; - field bits<1> VOPC = 0; - - field bits<1> MUBUF = 0; - field bits<1> MTBUF = 0; - field bits<1> SMRD = 0; - field bits<1> DS = 0; - field bits<1> MIMG = 0; - field bits<1> FLAT = 0; - field bits<1> WQM = 0; - field bits<1> VGPRSpill = 0; - - // These need to be kept in sync with the enum in SIInstrFlags. - let TSFlags{0} = VM_CNT; - let TSFlags{1} = EXP_CNT; - let TSFlags{2} = LGKM_CNT; - - let TSFlags{3} = SALU; - let TSFlags{4} = VALU; - - let TSFlags{5} = SOP1; - let TSFlags{6} = SOP2; - let TSFlags{7} = SOPC; - let TSFlags{8} = SOPK; - let TSFlags{9} = SOPP; - - let TSFlags{10} = VOP1; - let TSFlags{11} = VOP2; - let TSFlags{12} = VOP3; - let TSFlags{13} = VOPC; - - let TSFlags{14} = MUBUF; - let TSFlags{15} = MTBUF; - let TSFlags{16} = SMRD; - let TSFlags{17} = DS; - let TSFlags{18} = MIMG; - let TSFlags{19} = FLAT; - let TSFlags{20} = WQM; - let TSFlags{21} = VGPRSpill; - - // Most instructions require adjustments after selection to satisfy - // operand requirements. - let hasPostISelHook = 1; - let SchedRW = [Write32Bit]; -} - -class Enc32 { - field bits<32> Inst; - int Size = 4; -} - -class Enc64 { - field bits<64> Inst; - int Size = 8; -} - -class VOPDstOperand : RegisterOperand ; -def VOPDstVCC : VOPDstOperand ; - -let Uses = [EXEC] in { - -class VOPAnyCommon pattern> : - InstSI { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VALU = 1; -} - -class VOPCCommon pattern> : - VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { - - let DisableEncoding = "$dst"; - let VOPC = 1; - let Size = 4; -} - -class VOP1Common pattern> : - VOPAnyCommon { - - let VOP1 = 1; - let Size = 4; -} - -class VOP2Common pattern> : - VOPAnyCommon { - - let VOP2 = 1; - let Size = 4; -} - -class VOP3Common pattern> : - VOPAnyCommon { - - // Using complex patterns gives VOP3 patterns a very high complexity rating, - // but standalone patterns are almost always prefered, so we need to adjust the - // priority lower. The goal is to use a high number to reduce complexity to - // zero (or less than zero). 
- let AddedComplexity = -1000; - - let VOP3 = 1; - let VALU = 1; - - let AsmMatchConverter = "cvtVOP3"; - let isCodeGenOnly = 0; - - int Size = 8; -} - -} // End Uses = [EXEC] - -//===----------------------------------------------------------------------===// -// Scalar operations -//===----------------------------------------------------------------------===// - -class SOP1e op> : Enc32 { - bits<7> sdst; - bits<8> ssrc0; - - let Inst{7-0} = ssrc0; - let Inst{15-8} = op; - let Inst{22-16} = sdst; - let Inst{31-23} = 0x17d; //encoding; -} - -class SOP2e op> : Enc32 { - bits<7> sdst; - bits<8> ssrc0; - bits<8> ssrc1; - - let Inst{7-0} = ssrc0; - let Inst{15-8} = ssrc1; - let Inst{22-16} = sdst; - let Inst{29-23} = op; - let Inst{31-30} = 0x2; // encoding -} - -class SOPCe op> : Enc32 { - bits<8> ssrc0; - bits<8> ssrc1; - - let Inst{7-0} = ssrc0; - let Inst{15-8} = ssrc1; - let Inst{22-16} = op; - let Inst{31-23} = 0x17e; -} - -class SOPKe op> : Enc32 { - bits <7> sdst; - bits <16> simm16; - - let Inst{15-0} = simm16; - let Inst{22-16} = sdst; - let Inst{27-23} = op; - let Inst{31-28} = 0xb; //encoding -} - -class SOPK64e op> : Enc64 { - bits <7> sdst = 0; - bits <16> simm16; - bits <32> imm; - - let Inst{15-0} = simm16; - let Inst{22-16} = sdst; - let Inst{27-23} = op; - let Inst{31-28} = 0xb; - - let Inst{63-32} = imm; -} - -class SOPPe op> : Enc32 { - bits <16> simm16; - - let Inst{15-0} = simm16; - let Inst{22-16} = op; - let Inst{31-23} = 0x17f; // encoding -} - -class SMRDe op, bits<1> imm> : Enc32 { - bits<7> sdst; - bits<7> sbase; - bits<8> offset; - - let Inst{7-0} = offset; - let Inst{8} = imm; - let Inst{14-9} = sbase{6-1}; - let Inst{21-15} = sdst; - let Inst{26-22} = op; - let Inst{31-27} = 0x18; //encoding -} - -let SchedRW = [WriteSALU] in { -class SOP1 pattern> : - InstSI { - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let isCodeGenOnly = 0; - let SALU = 1; - let SOP1 = 1; -} - -class SOP2 pattern> : - InstSI { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let isCodeGenOnly = 0; - let SALU = 1; - let SOP2 = 1; - - let UseNamedOperandTable = 1; -} - -class SOPC op, dag outs, dag ins, string asm, list pattern> : - InstSI, SOPCe { - - let DisableEncoding = "$dst"; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; - let SOPC = 1; - let isCodeGenOnly = 0; - - let UseNamedOperandTable = 1; -} - -class SOPK pattern> : - InstSI { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; - let SOPK = 1; - - let UseNamedOperandTable = 1; -} - -class SOPP op, dag ins, string asm, list pattern = []> : - InstSI <(outs), ins, asm, pattern >, SOPPe { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let SALU = 1; - let SOPP = 1; - - let UseNamedOperandTable = 1; -} - -} // let SchedRW = [WriteSALU] - -class SMRD pattern> : - InstSI { - - let LGKM_CNT = 1; - let SMRD = 1; - let mayStore = 0; - let mayLoad = 1; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let SchedRW = [WriteSMEM]; -} - -//===----------------------------------------------------------------------===// -// Vector ALU operations -//===----------------------------------------------------------------------===// - -class VOP1e op> : Enc32 { - bits<8> vdst; - bits<9> src0; - - let Inst{8-0} = src0; - let Inst{16-9} = op; - let Inst{24-17} = vdst; - let Inst{31-25} = 0x3f; //encoding -} - -class VOP2e op> : Enc32 { - bits<8> vdst; - bits<9> src0; - bits<8> src1; - - let Inst{8-0} = src0; - let 
Inst{16-9} = src1; - let Inst{24-17} = vdst; - let Inst{30-25} = op; - let Inst{31} = 0x0; //encoding -} - -class VOP2_MADKe op> : Enc64 { - - bits<8> vdst; - bits<9> src0; - bits<8> vsrc1; - bits<32> src2; - - let Inst{8-0} = src0; - let Inst{16-9} = vsrc1; - let Inst{24-17} = vdst; - let Inst{30-25} = op; - let Inst{31} = 0x0; // encoding - let Inst{63-32} = src2; -} - -class VOP3e op> : Enc64 { - bits<8> vdst; - bits<2> src0_modifiers; - bits<9> src0; - bits<2> src1_modifiers; - bits<9> src1; - bits<2> src2_modifiers; - bits<9> src2; - bits<1> clamp; - bits<2> omod; - - let Inst{7-0} = vdst; - let Inst{8} = src0_modifiers{1}; - let Inst{9} = src1_modifiers{1}; - let Inst{10} = src2_modifiers{1}; - let Inst{11} = clamp; - let Inst{25-17} = op; - let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = src0; - let Inst{49-41} = src1; - let Inst{58-50} = src2; - let Inst{60-59} = omod; - let Inst{61} = src0_modifiers{0}; - let Inst{62} = src1_modifiers{0}; - let Inst{63} = src2_modifiers{0}; -} - -class VOP3be op> : Enc64 { - bits<8> vdst; - bits<2> src0_modifiers; - bits<9> src0; - bits<2> src1_modifiers; - bits<9> src1; - bits<2> src2_modifiers; - bits<9> src2; - bits<7> sdst; - bits<2> omod; - - let Inst{7-0} = vdst; - let Inst{14-8} = sdst; - let Inst{25-17} = op; - let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = src0; - let Inst{49-41} = src1; - let Inst{58-50} = src2; - let Inst{60-59} = omod; - let Inst{61} = src0_modifiers{0}; - let Inst{62} = src1_modifiers{0}; - let Inst{63} = src2_modifiers{0}; -} - -class VOPCe op> : Enc32 { - bits<9> src0; - bits<8> vsrc1; - - let Inst{8-0} = src0; - let Inst{16-9} = vsrc1; - let Inst{24-17} = op; - let Inst{31-25} = 0x3e; -} - -class VINTRPe op> : Enc32 { - bits<8> vdst; - bits<8> vsrc; - bits<2> attrchan; - bits<6> attr; - - let Inst{7-0} = vsrc; - let Inst{9-8} = attrchan; - let Inst{15-10} = attr; - let Inst{17-16} = op; - let Inst{25-18} = vdst; - let Inst{31-26} = 0x32; // encoding -} - -class DSe op> : Enc64 { - bits<8> vdst; - bits<1> gds; - bits<8> addr; - bits<8> data0; - bits<8> data1; - bits<8> offset0; - bits<8> offset1; - - let Inst{7-0} = offset0; - let Inst{15-8} = offset1; - let Inst{17} = gds; - let Inst{25-18} = op; - let Inst{31-26} = 0x36; //encoding - let Inst{39-32} = addr; - let Inst{47-40} = data0; - let Inst{55-48} = data1; - let Inst{63-56} = vdst; -} - -class MUBUFe op> : Enc64 { - bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> addr64; - bits<1> lds; - bits<8> vaddr; - bits<8> vdata; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{15} = addr64; - let Inst{16} = lds; - let Inst{24-18} = op; - let Inst{31-26} = 0x38; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} = tfe; - let Inst{63-56} = soffset; -} - -class MTBUFe op> : Enc64 { - bits<8> vdata; - bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> addr64; - bits<4> dfmt; - bits<3> nfmt; - bits<8> vaddr; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{15} = addr64; - let Inst{18-16} = op; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let 
Inst{54} = slc; - let Inst{55} = tfe; - let Inst{63-56} = soffset; -} - -class MIMGe op> : Enc64 { - bits<8> vdata; - bits<4> dmask; - bits<1> unorm; - bits<1> glc; - bits<1> da; - bits<1> r128; - bits<1> tfe; - bits<1> lwe; - bits<1> slc; - bits<8> vaddr; - bits<7> srsrc; - bits<7> ssamp; - - let Inst{11-8} = dmask; - let Inst{12} = unorm; - let Inst{13} = glc; - let Inst{14} = da; - let Inst{15} = r128; - let Inst{16} = tfe; - let Inst{17} = lwe; - let Inst{24-18} = op; - let Inst{25} = slc; - let Inst{31-26} = 0x3c; - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{57-53} = ssamp{6-2}; -} - -class FLATe op> : Enc64 { - bits<8> addr; - bits<8> data; - bits<8> vdst; - bits<1> slc; - bits<1> glc; - bits<1> tfe; - - // 15-0 is reserved. - let Inst{16} = glc; - let Inst{17} = slc; - let Inst{24-18} = op; - let Inst{31-26} = 0x37; // Encoding. - let Inst{39-32} = addr; - let Inst{47-40} = data; - // 54-48 is reserved. - let Inst{55} = tfe; - let Inst{63-56} = vdst; -} - -class EXPe : Enc64 { - bits<4> en; - bits<6> tgt; - bits<1> compr; - bits<1> done; - bits<1> vm; - bits<8> vsrc0; - bits<8> vsrc1; - bits<8> vsrc2; - bits<8> vsrc3; - - let Inst{3-0} = en; - let Inst{9-4} = tgt; - let Inst{10} = compr; - let Inst{11} = done; - let Inst{12} = vm; - let Inst{31-26} = 0x3e; - let Inst{39-32} = vsrc0; - let Inst{47-40} = vsrc1; - let Inst{55-48} = vsrc2; - let Inst{63-56} = vsrc3; -} - -let Uses = [EXEC] in { - -class VOP1 op, dag outs, dag ins, string asm, list pattern> : - VOP1Common , - VOP1e { - let isCodeGenOnly = 0; -} - -class VOP2 op, dag outs, dag ins, string asm, list pattern> : - VOP2Common , VOP2e { - let isCodeGenOnly = 0; -} - -class VOPC op, dag ins, string asm, list pattern> : - VOPCCommon , VOPCe ; - -class VINTRPCommon pattern> : - InstSI { - let mayLoad = 1; - let mayStore = 0; - let hasSideEffects = 0; -} - -} // End Uses = [EXEC] - -//===----------------------------------------------------------------------===// -// Vector I/O operations -//===----------------------------------------------------------------------===// - -let Uses = [EXEC] in { - -class DS pattern> : - InstSI { - - let LGKM_CNT = 1; - let DS = 1; - let UseNamedOperandTable = 1; - let Uses = [M0]; - - // Most instruction load and store data, so set this as the default. - let mayLoad = 1; - let mayStore = 1; - - let hasSideEffects = 0; - let AsmMatchConverter = "cvtDS"; - let SchedRW = [WriteLDS]; -} - -class MUBUF pattern> : - InstSI { - - let VM_CNT = 1; - let EXP_CNT = 1; - let MUBUF = 1; - - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let AsmMatchConverter = "cvtMubuf"; - let SchedRW = [WriteVMEM]; -} - -class MTBUF pattern> : - InstSI { - - let VM_CNT = 1; - let EXP_CNT = 1; - let MTBUF = 1; - - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let SchedRW = [WriteVMEM]; -} - -class FLAT op, dag outs, dag ins, string asm, list pattern> : - InstSI, FLATe { - let FLAT = 1; - // Internally, FLAT instruction are executed as both an LDS and a - // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT - // and are not considered done until both have been decremented. - let VM_CNT = 1; - let LGKM_CNT = 1; - - let Uses = [EXEC, FLAT_SCR]; // M0 - - let UseNamedOperandTable = 1; - let hasSideEffects = 0; -} - -class MIMG op, dag outs, dag ins, string asm, list pattern> : - InstSI , MIMGe { - - let VM_CNT = 1; - let EXP_CNT = 1; - let MIMG = 1; - - let hasSideEffects = 0; // XXX ???? 
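To make the Enc32 layouts above concrete, here is a hypothetical decoder for the SOP2 format: ssrc0 in bits 7-0, ssrc1 in 15-8, sdst in 22-16, op in 29-23, and the fixed 0b10 marker in bits 31-30. This is an illustrative sketch, not part of the backend:

    #include <cassert>
    #include <cstdint>

    struct SOP2Fields {
      uint8_t SSrc0, SSrc1, SDst, Op;
    };

    // Field extraction mirroring the SOP2e encoding class above.
    static SOP2Fields decodeSOP2(uint32_t Inst) {
      assert((Inst >> 30) == 0x2 && "not a SOP2 encoding");
      SOP2Fields F;
      F.SSrc0 = Inst & 0xff;
      F.SSrc1 = (Inst >> 8) & 0xff;
      F.SDst = (Inst >> 16) & 0x7f;
      F.Op = (Inst >> 23) & 0x7f;
      return F;
    }

    int main() {
      uint32_t Inst = (0x2u << 30) | (2u << 16) | (1u << 8);
      SOP2Fields F = decodeSOP2(Inst);
      assert(F.Op == 0 && F.SDst == 2 && F.SSrc1 == 1 && F.SSrc0 == 0);
    }
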
-} - - -} // End Uses = [EXEC] Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h (nonexistent) @@ -1,307 +0,0 @@ -//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition of the TargetLowering class that is common -/// to all AMD GPUs. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H -#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H - -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -class AMDGPUMachineFunction; -class AMDGPUSubtarget; -class MachineRegisterInfo; - -class AMDGPUTargetLowering : public TargetLowering { -protected: - const AMDGPUSubtarget *Subtarget; - -private: - SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, - const SDValue &InitPtr, - SDValue Chain, - SelectionDAG &DAG) const; - SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; - /// \brief Lower vector stores by merging the vector elements into an integer - /// of the same bitwidth. - SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; - /// \brief Split a vector store into multiple scalar stores. - /// \returns The resulting chain. - - SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; - SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; - SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - - SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; - -protected: - static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); - static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); - - virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, - SelectionDAG &DAG) const; - - /// \brief Split a vector load into a scalar load of each component. 
-  SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const;
-
-  /// \brief Split a vector load into 2 loads of half the vector.
-  SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
-
-  /// \brief Split a vector store into a scalar store of each component.
-  SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const;
-
-  /// \brief Split a vector store into 2 stores of half the vector.
-  SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
-
-  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
-  void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
-                      SmallVectorImpl<SDValue> &Results) const;
-  bool isHWTrueValue(SDValue Op) const;
-  bool isHWFalseValue(SDValue Op) const;
-
-  /// The SelectionDAGBuilder will automatically promote function arguments
-  /// with illegal types. However, this does not work for the AMDGPU targets
-  /// since the function arguments are stored in memory as these illegal types.
-  /// In order to handle this properly we need to get the original type sizes
-  /// from the LLVM IR Function and fixup the ISD::InputArg values before
-  /// passing them to AnalyzeFormalArguments()
-  void getOriginalFunctionArgs(SelectionDAG &DAG,
-                               const Function *F,
-                               const SmallVectorImpl<ISD::InputArg> &Ins,
-                               SmallVectorImpl<ISD::InputArg> &OrigIns) const;
-  void AnalyzeFormalArguments(CCState &State,
-                              const SmallVectorImpl<ISD::InputArg> &Ins) const;
-
-public:
-  AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
-
-  bool isFAbsFree(EVT VT) const override;
-  bool isFNegFree(EVT VT) const override;
-  bool isTruncateFree(EVT Src, EVT Dest) const override;
-  bool isTruncateFree(Type *Src, Type *Dest) const override;
-
-  bool isZExtFree(Type *Src, Type *Dest) const override;
-  bool isZExtFree(EVT Src, EVT Dest) const override;
-  bool isZExtFree(SDValue Val, EVT VT2) const override;
-
-  bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
-
-  MVT getVectorIdxTy() const override;
-  bool isSelectSupported(SelectSupportKind) const override;
-
-  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
-  bool ShouldShrinkFPConstant(EVT VT) const override;
-  bool shouldReduceLoadWidth(SDNode *Load,
-                             ISD::LoadExtType ExtType,
-                             EVT ExtVT) const override;
-
-  bool isLoadBitCastBeneficial(EVT, EVT) const override;
-
-  bool storeOfVectorConstantIsCheap(EVT MemVT,
-                                    unsigned NumElem,
-                                    unsigned AS) const override;
-  bool isCheapToSpeculateCttz() const override;
-  bool isCheapToSpeculateCtlz() const override;
-
-  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
-                      bool isVarArg,
-                      const SmallVectorImpl<ISD::OutputArg> &Outs,
-                      const SmallVectorImpl<SDValue> &OutVals,
-                      SDLoc DL, SelectionDAG &DAG) const override;
-  SDValue LowerCall(CallLoweringInfo &CLI,
-                    SmallVectorImpl<SDValue> &InVals) const override;
-
-  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
-  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
-  void ReplaceNodeResults(SDNode *N,
-                          SmallVectorImpl<SDValue> &Results,
-                          SelectionDAG &DAG) const override;
-
-  SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
-  SDValue CombineFMinMaxLegacy(SDLoc DL,
-                               EVT VT,
-                               SDValue LHS,
-                               SDValue RHS,
-                               SDValue True,
-                               SDValue False,
-                               SDValue CC,
-                               DAGCombinerInfo &DCI) const;
-  SDValue
CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const; - - const char* getTargetNodeName(unsigned Opcode) const override; - - SDValue getRsqrtEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps, - bool &UseOneConstNR) const override; - SDValue getRecipEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const override; - - virtual SDNode *PostISelFolding(MachineSDNode *N, - SelectionDAG &DAG) const { - return N; - } - - /// \brief Determine which of the bits specified in \p Mask are known to be - /// either zero or one and return them in the \p KnownZero and \p KnownOne - /// bitsets. - void computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const override; - - unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, - unsigned Depth = 0) const override; - - /// \brief Helper function that adds Reg to the LiveIn list of the DAG's - /// MachineFunction. - /// - /// \returns a RegisterSDNode representing Reg. - virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; -}; - -namespace AMDGPUISD { - -enum NodeType : unsigned { - // AMDIL ISD Opcodes - FIRST_NUMBER = ISD::BUILTIN_OP_END, - CALL, // Function call based on a single integer - UMUL, // 32bit unsigned multiplication - RET_FLAG, - BRANCH_COND, - // End AMDIL ISD Opcodes - DWORDADDR, - FRACT, - CLAMP, - - // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. - // Denormals handled on some parts. - COS_HW, - SIN_HW, - FMAX_LEGACY, - FMIN_LEGACY, - FMAX3, - SMAX3, - UMAX3, - FMIN3, - SMIN3, - UMIN3, - URECIP, - DIV_SCALE, - DIV_FMAS, - DIV_FIXUP, - TRIG_PREOP, // 1 ULP max error for f64 - - // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. - // For f64, max error 2^29 ULP, handles denormals. - RCP, - RSQ, - RSQ_LEGACY, - RSQ_CLAMPED, - LDEXP, - FP_CLASS, - DOT4, - CARRY, - BORROW, - BFE_U32, // Extract range of bits with zero extension to 32-bits. - BFE_I32, // Extract range of bits with sign extension to 32-bits. - BFI, // (src0 & src1) | (~src0 & src2) - BFM, // Insert a range of bits into a 32-bit word. - BREV, // Reverse bits. - MUL_U24, - MUL_I24, - MAD_U24, - MAD_I24, - TEXTURE_FETCH, - EXPORT, - CONST_ADDRESS, - REGISTER_LOAD, - REGISTER_STORE, - LOAD_INPUT, - SAMPLE, - SAMPLEB, - SAMPLED, - SAMPLEL, - - // These cvt_f32_ubyte* nodes need to remain consecutive and in order. - CVT_F32_UBYTE0, - CVT_F32_UBYTE1, - CVT_F32_UBYTE2, - CVT_F32_UBYTE3, - /// This node is for VLIW targets and it is used to represent a vector - /// that is stored in consecutive registers with the same channel. - /// For example: - /// |X |Y|Z|W| - /// T0|v.x| | | | - /// T1|v.y| | | | - /// T2|v.z| | | | - /// T3|v.w| | | | - BUILD_VERTICAL_VECTOR, - /// Pointer to the start of the shader's constant data. 
- CONST_DATA_PTR, - SENDMSG, - INTERP_MOV, - INTERP_P1, - INTERP_P2, - FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, - STORE_MSKOR, - LOAD_CONSTANT, - TBUFFER_STORE_FORMAT, - LAST_AMDGPU_ISD_NUMBER -}; - - -} // End namespace AMDGPUISD - -} // End namespace llvm - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp (nonexistent) @@ -1,181 +0,0 @@ -//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// \brief The R600 code emitter produces machine code that can be executed -/// directly on the GPU device. -// -//===----------------------------------------------------------------------===// - -#include "R600Defines.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/EndianStream.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { - R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; - void operator=(const R600MCCodeEmitter &) = delete; - const MCInstrInfo &MCII; - const MCRegisterInfo &MRI; - -public: - - R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) - : MCII(mcii), MRI(mri) { } - - /// \brief Encode the instruction and write it to the OS. - void encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; - - /// \returns the encoding for an MCOperand. 
-  uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const override;
-private:
-
-  void EmitByte(unsigned int byte, raw_ostream &OS) const;
-
-  void Emit(uint32_t value, raw_ostream &OS) const;
-  void Emit(uint64_t value, raw_ostream &OS) const;
-
-  unsigned getHWRegChan(unsigned reg) const;
-  unsigned getHWReg(unsigned regNo) const;
-
-};
-
-} // End anonymous namespace
-
-enum RegElement {
-  ELEMENT_X = 0,
-  ELEMENT_Y,
-  ELEMENT_Z,
-  ELEMENT_W
-};
-
-enum FCInstr {
-  FC_IF_PREDICATE = 0,
-  FC_ELSE,
-  FC_ENDIF,
-  FC_BGNLOOP,
-  FC_ENDLOOP,
-  FC_BREAK_PREDICATE,
-  FC_CONTINUE
-};
-
-MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
-                                             const MCRegisterInfo &MRI,
-                                             MCContext &Ctx) {
-  return new R600MCCodeEmitter(MCII, MRI);
-}
-
-void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
-                                          SmallVectorImpl<MCFixup> &Fixups,
-                                          const MCSubtargetInfo &STI) const {
-  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
-  if (MI.getOpcode() == AMDGPU::RETURN ||
-      MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
-      MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
-      MI.getOpcode() == AMDGPU::BUNDLE ||
-      MI.getOpcode() == AMDGPU::KILL) {
-    return;
-  } else if (IS_VTX(Desc)) {
-    uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI);
-    uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
-    if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) {
-      InstWord2 |= 1 << 19; // Mega-Fetch bit
-    }
-
-    Emit(InstWord01, OS);
-    Emit(InstWord2, OS);
-    Emit((uint32_t) 0, OS);
-  } else if (IS_TEX(Desc)) {
-    int64_t Sampler = MI.getOperand(14).getImm();
-
-    int64_t SrcSelect[4] = {
-      MI.getOperand(2).getImm(),
-      MI.getOperand(3).getImm(),
-      MI.getOperand(4).getImm(),
-      MI.getOperand(5).getImm()
-    };
-    int64_t Offsets[3] = {
-      MI.getOperand(6).getImm() & 0x1F,
-      MI.getOperand(7).getImm() & 0x1F,
-      MI.getOperand(8).getImm() & 0x1F
-    };
-
-    uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI);
-    uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 |
-        SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 |
-        SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
-        Offsets[2] << 10;
-
-    Emit(Word01, OS);
-    Emit(Word2, OS);
-    Emit((uint32_t) 0, OS);
-  } else {
-    uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI);
-    if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) &&
-        ((Desc.TSFlags & R600_InstFlag::OP1) ||
-         Desc.TSFlags & R600_InstFlag::OP2)) {
-      uint64_t ISAOpCode = Inst & (0x3FFULL << 39);
-      Inst &= ~(0x3FFULL << 39);
-      Inst |= ISAOpCode << 1;
-    }
-    Emit(Inst, OS);
-  }
-}
-
-void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
-  OS.write((uint8_t) Byte & 0xff);
-}
-
-void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
-  support::endian::Writer<support::little>(OS).write(Value);
-}
-
-void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
-  support::endian::Writer<support::little>(OS).write(Value);
-}
-
-unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const {
-  return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT;
-}
-
-unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
-  return MRI.getEncodingValue(RegNo) & HW_REG_MASK;
-}
-
-uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
-                                              const MCOperand &MO,
-                                              SmallVectorImpl<MCFixup> &Fixup,
-                                              const MCSubtargetInfo &STI) const {
-  if (MO.isReg()) {
-    if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags))
-      return MRI.getEncodingValue(MO.getReg());
-    return getHWReg(MO.getReg());
-  }
-
-  assert(MO.isImm());
- return MO.getImm(); -} - -#include "AMDGPUGenMCCodeEmitter.inc" Index: projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h (nonexistent) @@ -1,34 +0,0 @@ -//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H - -#include "llvm/MC/MCFixup.h" - -namespace llvm { -namespace AMDGPU { -enum Fixups { - /// 16-bit PC relative fixup for SOPP branch instructions. - fixup_si_sopp_br = FirstTargetFixupKind, - - /// fixup for global addresses with constant initializers - fixup_si_rodata, - - /// fixup for offset from instruction to end of text section - fixup_si_end_of_text, - - // Marker - LastTargetFixupKind, - NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind -}; -} -} - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp (nonexistent) @@ -1,21 +0,0 @@ -//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief CodeEmitter interface for R600 and SI codegen. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCCodeEmitter.h" - -using namespace llvm; - -// pin vtable to this file -void AMDGPUMCCodeEmitter::anchor() {} - Index: projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp (nonexistent) @@ -1,90 +0,0 @@ -//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This file provides AMDGPU specific target descriptions. 
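One detail of the R600MCCodeEmitter above that rewards a worked example is the FeatureR600ALUInst fixup: TableGen places the 10-bit ALU opcode at bits 48-39, and on those parts the emitter moves the field up by one bit, apparently because the hardware expects it at bits 49-40. A self-contained sketch of that relocation:

    #include <cassert>
    #include <cstdint>

    // Mirror the bit-field move performed for FeatureR600ALUInst parts.
    static uint64_t moveALUOpcode(uint64_t Inst) {
      uint64_t Op = Inst & (0x3FFULL << 39); // extract bits [48:39]
      Inst &= ~(0x3FFULL << 39);             // clear the old field
      return Inst | (Op << 1);               // reinsert at [49:40]
    }

    int main() {
      uint64_t Inst = 0x155ULL << 39;
      assert(moveALUOpcode(Inst) == (0x155ULL << 40));
    }
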
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCTargetDesc.h" -#include "AMDGPUMCAsmInfo.h" -#include "InstPrinter/AMDGPUInstPrinter.h" -#include "SIDefines.h" -#include "llvm/MC/MCCodeGenInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MachineLocation.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define GET_INSTRINFO_MC_DESC -#include "AMDGPUGenInstrInfo.inc" - -#define GET_SUBTARGETINFO_MC_DESC -#include "AMDGPUGenSubtargetInfo.inc" - -#define GET_REGINFO_MC_DESC -#include "AMDGPUGenRegisterInfo.inc" - -static MCInstrInfo *createAMDGPUMCInstrInfo() { - MCInstrInfo *X = new MCInstrInfo(); - InitAMDGPUMCInstrInfo(X); - return X; -} - -static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) { - MCRegisterInfo *X = new MCRegisterInfo(); - InitAMDGPUMCRegisterInfo(X, 0); - return X; -} - -static MCSubtargetInfo *createAMDGPUMCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { - MCSubtargetInfo * X = new MCSubtargetInfo(); - InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS); - return X; -} - -static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->initMCCodeGenInfo(RM, CM, OL); - return X; -} - -static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, - unsigned SyntaxVariant, - const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI) { - return new AMDGPUInstPrinter(MAI, MII, MRI); -} - -extern "C" void LLVMInitializeR600TargetMC() { - for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { - RegisterMCAsmInfo X(*T); - - TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo); - TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); - TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); - TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); - TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); - TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); - } - - TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, - createR600MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createSIMCCodeEmitter); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h (nonexistent) @@ -1,60 +0,0 @@ -//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Provides AMDGPU specific target descriptions. 
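Once LLVMInitializeR600TargetMC() above has run, clients normally reach these MC components through the TargetRegistry rather than by calling the create* functions directly. A minimal sketch of that lookup; the triple spelling "r600--" is an assumption for this era, and error handling is trimmed:

    #include "llvm/MC/MCInstrInfo.h"
    #include "llvm/Support/TargetRegistry.h"
    #include <memory>
    #include <string>

    int main() {
      // Assumes the R600 target has been linked in and registered,
      // e.g. via LLVMInitializeR600TargetMC().
      std::string Err;
      const llvm::Target *T = llvm::TargetRegistry::lookupTarget("r600--", Err);
      if (!T)
        return 1;
      std::unique_ptr<llvm::MCInstrInfo> MCII(T->createMCInstrInfo());
      return MCII ? 0 : 1;
    }
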
-// -//===----------------------------------------------------------------------===// -// - -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H - -#include "llvm/Support/DataTypes.h" -#include "llvm/ADT/StringRef.h" - -namespace llvm { -class MCAsmBackend; -class MCCodeEmitter; -class MCContext; -class MCInstrInfo; -class MCObjectWriter; -class MCRegisterInfo; -class MCSubtargetInfo; -class Target; -class raw_pwrite_stream; -class raw_ostream; - -extern Target TheAMDGPUTarget; -extern Target TheGCNTarget; - -MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx); - -MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx); - -MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); - -MCObjectWriter *createAMDGPUELFObjectWriter(raw_pwrite_stream &OS); -} // End llvm namespace - -#define GET_REGINFO_ENUM -#include "AMDGPUGenRegisterInfo.inc" - -#define GET_INSTRINFO_ENUM -#include "AMDGPUGenInstrInfo.inc" - -#define GET_SUBTARGETINFO_ENUM -#include "AMDGPUGenSubtargetInfo.inc" - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp (nonexistent) @@ -1,289 +0,0 @@ -//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief The SI code emitter produces machine code that can be executed -/// directly on the GPU device. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "MCTargetDesc/AMDGPUFixupKinds.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCFixup.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { - SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; - void operator=(const SIMCCodeEmitter &) = delete; - const MCInstrInfo &MCII; - const MCRegisterInfo &MRI; - MCContext &Ctx; - - /// \brief Can this operand also contain immediate values? - bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; - - /// \brief Encode an fp or int literal - uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const; - -public: - SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, - MCContext &ctx) - : MCII(mcii), MRI(mri), Ctx(ctx) { } - - ~SIMCCodeEmitter() override {} - - /// \brief Encode the instruction and write it to the OS. 
-  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
-                         SmallVectorImpl<MCFixup> &Fixups,
-                         const MCSubtargetInfo &STI) const override;
-
-  /// \returns the encoding for an MCOperand.
-  uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const override;
-
-  /// \brief Use a fixup to encode the simm16 field for SOPP branch
-  /// instructions.
-  unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const override;
-};
-
-} // End anonymous namespace
-
-MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
-                                           const MCRegisterInfo &MRI,
-                                           MCContext &Ctx) {
-  return new SIMCCodeEmitter(MCII, MRI, Ctx);
-}
-
-bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc,
-                                   unsigned OpNo) const {
-  unsigned OpType = Desc.OpInfo[OpNo].OperandType;
-
-  return OpType == AMDGPU::OPERAND_REG_IMM32 ||
-         OpType == AMDGPU::OPERAND_REG_INLINE_C;
-}
-
-// Returns the encoding value to use if the given integer is an integer inline
-// immediate value, or 0 if it is not.
-template <typename IntTy>
-static uint32_t getIntInlineImmEncoding(IntTy Imm) {
-  if (Imm >= 0 && Imm <= 64)
-    return 128 + Imm;
-
-  if (Imm >= -16 && Imm <= -1)
-    return 192 + std::abs(Imm);
-
-  return 0;
-}
-
-static uint32_t getLit32Encoding(uint32_t Val) {
-  uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val));
-  if (IntImm != 0)
-    return IntImm;
-
-  if (Val == FloatToBits(0.5f))
-    return 240;
-
-  if (Val == FloatToBits(-0.5f))
-    return 241;
-
-  if (Val == FloatToBits(1.0f))
-    return 242;
-
-  if (Val == FloatToBits(-1.0f))
-    return 243;
-
-  if (Val == FloatToBits(2.0f))
-    return 244;
-
-  if (Val == FloatToBits(-2.0f))
-    return 245;
-
-  if (Val == FloatToBits(4.0f))
-    return 246;
-
-  if (Val == FloatToBits(-4.0f))
-    return 247;
-
-  return 255;
-}
-
-static uint32_t getLit64Encoding(uint64_t Val) {
-  uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val));
-  if (IntImm != 0)
-    return IntImm;
-
-  if (Val == DoubleToBits(0.5))
-    return 240;
-
-  if (Val == DoubleToBits(-0.5))
-    return 241;
-
-  if (Val == DoubleToBits(1.0))
-    return 242;
-
-  if (Val == DoubleToBits(-1.0))
-    return 243;
-
-  if (Val == DoubleToBits(2.0))
-    return 244;
-
-  if (Val == DoubleToBits(-2.0))
-    return 245;
-
-  if (Val == DoubleToBits(4.0))
-    return 246;
-
-  if (Val == DoubleToBits(-4.0))
-    return 247;
-
-  return 255;
-}
-
-uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
-                                         unsigned OpSize) const {
-  if (MO.isExpr())
-    return 255;
-
-  assert(!MO.isFPImm());
-
-  if (!MO.isImm())
-    return ~0;
-
-  if (OpSize == 4)
-    return getLit32Encoding(static_cast<uint32_t>(MO.getImm()));
-
-  assert(OpSize == 8);
-
-  return getLit64Encoding(static_cast<uint64_t>(MO.getImm()));
-}
-
-void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
-                                        SmallVectorImpl<MCFixup> &Fixups,
-                                        const MCSubtargetInfo &STI) const {
-
-  uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI);
-  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
-  unsigned bytes = Desc.getSize();
-
-  for (unsigned i = 0; i < bytes; i++) {
-    OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
-  }
-
-  if (bytes > 4)
-    return;
-
-  // Check for additional literals in SRC0/1/2 (Op 1/2/3)
-  for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
-
-    // Check if this operand should be encoded as [SV]Src
-    if (!isSrcOperand(Desc, i))
-      continue;
-
-    int RCID = Desc.OpInfo[i].RegClass;
-    const MCRegisterClass &RC = MRI.getRegClass(RCID);
-
-    // Is this operand a literal immediate?
-    const MCOperand &Op = MI.getOperand(i);
-    if (getLitEncoding(Op, RC.getSize()) != 255)
-      continue;
-
-    // Yes! Encode it
-    int64_t Imm = 0;
-
-    if (Op.isImm())
-      Imm = Op.getImm();
-    else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
-      llvm_unreachable("Must be immediate or expr");
-
-    for (unsigned j = 0; j < 4; j++) {
-      OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff));
-    }
-
-    // Only one literal value allowed
-    break;
-  }
-}
-
-unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
-                                            SmallVectorImpl<MCFixup> &Fixups,
-                                            const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpNo);
-
-  if (MO.isExpr()) {
-    const MCExpr *Expr = MO.getExpr();
-    MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br;
-    Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
-    return 0;
-  }
-
-  return getMachineOpValue(MI, MO, Fixups, STI);
-}
-
-uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
-                                            const MCOperand &MO,
-                                            SmallVectorImpl<MCFixup> &Fixups,
-                                            const MCSubtargetInfo &STI) const {
-  if (MO.isReg())
-    return MRI.getEncodingValue(MO.getReg());
-
-  if (MO.isExpr()) {
-    const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr());
-    MCFixupKind Kind;
-    const MCSymbol *Sym =
-        Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
-
-    if (&Expr->getSymbol() == Sym) {
-      // Add the offset to the beginning of the constant values.
-      Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text;
-    } else {
-      // This is used for constant data stored in .rodata.
-      Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
-    }
-    Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc()));
-  }
-
-  // Figure out the operand number, needed for isSrcOperand check
-  unsigned OpNo = 0;
-  for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) {
-    if (&MO == &MI.getOperand(OpNo))
-      break;
-  }
-
-  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
-  if (isSrcOperand(Desc, OpNo)) {
-    int RCID = Desc.OpInfo[OpNo].RegClass;
-    const MCRegisterClass &RC = MRI.getRegClass(RCID);
-
-    uint32_t Enc = getLitEncoding(MO, RC.getSize());
-    if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
-      return Enc;
-
-  } else if (MO.isImm())
-    return MO.getImm();
-
-  llvm_unreachable("Encoding of this operand type is not supported yet.");
-  return 0;
-}
-
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp	(nonexistent)
@@ -1,145 +0,0 @@
-//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
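As a quick cross-check of the literal rules implemented by getIntInlineImmEncoding()/getLit32Encoding() above, here is a standalone sketch (illustrative only, not part of the patch; floatToBits is a local stand-in for llvm::FloatToBits):

// Standalone sketch of the SI inline-immediate rules above; not part of
// the patch.
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint32_t floatToBits(float F) {            // stand-in for FloatToBits
  uint32_t Bits; std::memcpy(&Bits, &F, 4); return Bits;
}

static uint32_t lit32Encoding(uint32_t Val) {
  int32_t I = static_cast<int32_t>(Val);
  if (I >= 0 && I <= 64)   return 128 + I;        // inline integers 0..64
  if (I >= -16 && I <= -1) return 192 - I;        // inline integers -16..-1
  static const float Special[] = {0.5f, -0.5f, 1.0f, -1.0f,
                                  2.0f, -2.0f, 4.0f, -4.0f};
  for (int K = 0; K < 8; ++K)                     // inline float constants
    if (Val == floatToBits(Special[K]))
      return 240 + K;
  return 255;                                     // needs a 32-bit literal
}

int main() {
  std::printf("%u\n", lit32Encoding(1));                  // 129
  std::printf("%u\n", lit32Encoding(uint32_t(-8)));       // 200
  std::printf("%u\n", lit32Encoding(floatToBits(-2.0f))); // 245
  std::printf("%u\n", lit32Encoding(12345));              // 255 (literal)
}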
-// -/// \file -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "MCTargetDesc/AMDGPUFixupKinds.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCFixupKindInfo.h" -#include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -namespace { - -class AMDGPUMCObjectWriter : public MCObjectWriter { -public: - AMDGPUMCObjectWriter(raw_pwrite_stream &OS) : MCObjectWriter(OS, true) {} - void executePostLayoutBinding(MCAssembler &Asm, - const MCAsmLayout &Layout) override { - //XXX: Implement if necessary. - } - void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, const MCFixup &Fixup, - MCValue Target, bool &IsPCRel, - uint64_t &FixedValue) override { - assert(!"Not implemented"); - } - - void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; - -}; - -class AMDGPUAsmBackend : public MCAsmBackend { -public: - AMDGPUAsmBackend(const Target &T) - : MCAsmBackend() {} - - unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const override { - return false; - } - void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { - assert(!"Not implemented"); - } - bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; - - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; -}; - -} //End anonymous namespace - -void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm, - const MCAsmLayout &Layout) { - for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { - Asm.writeSectionData(&*I, Layout); - } -} - -void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("Unknown fixup kind"); - case AMDGPU::fixup_si_sopp_br: { - uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); - *Dst = (Value - 4) / 4; - break; - } - - case AMDGPU::fixup_si_rodata: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - *Dst = Value; - break; - } - - case AMDGPU::fixup_si_end_of_text: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // The value points to the last instruction in the text section, so we - // need to add 4 bytes to get to the start of the constants. 
- *Dst = Value + 4; - break; - } - } -} - -const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( - MCFixupKind Kind) const { - const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { - // name offset bits flags - { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, 0 }, - { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } - }; - - if (Kind < FirstTargetFixupKind) - return MCAsmBackend::getFixupKindInfo(Kind); - - return Infos[Kind - FirstTargetFixupKind]; -} - -bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - OW->WriteZeros(Count); - - return true; -} - -//===----------------------------------------------------------------------===// -// ELFAMDGPUAsmBackend class -//===----------------------------------------------------------------------===// - -namespace { - -class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { -public: - ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } - - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { - return createAMDGPUELFObjectWriter(OS); - } -}; - -} // end anonymous namespace - -MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, - StringRef CPU) { - return new ELFAMDGPUAsmBackend(T); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp (nonexistent) @@ -1,43 +0,0 @@ -//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
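The two PC-relative cases in applyFixup() above reduce to simple arithmetic; the following standalone sketch works through them (illustrative only, not part of the patch):

// Standalone sketch of the fixup arithmetic in applyFixup() above.
#include <cassert>
#include <cstdint>

// fixup_si_sopp_br: Value is the byte distance to the branch target; the
// field holds (Value - 4) / 4, a count of 4-byte words relative to the
// instruction following the branch.
static uint16_t encodeSoppBr(uint64_t Value) {
  return static_cast<uint16_t>((Value - 4) / 4);
}

// fixup_si_end_of_text: Value points at the last instruction in .text, so
// 4 bytes are added to reach the first byte of the trailing constants.
static uint32_t encodeEndOfText(uint64_t Value) {
  return static_cast<uint32_t>(Value + 4);
}

int main() {
  assert(encodeSoppBr(4) == 0);   // branch to the very next instruction
  assert(encodeSoppBr(20) == 4);  // four instructions ahead
  assert(encodeEndOfText(96) == 100);
  return 0;
}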
-// -/// \file -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCAsmInfo.h" - -using namespace llvm; -AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { - HasSingleParameterDotFile = false; - //===------------------------------------------------------------------===// - MaxInstLength = 16; - SeparatorString = "\n"; - CommentString = ";"; - PrivateLabelPrefix = ""; - InlineAsmStart = ";#ASMSTART"; - InlineAsmEnd = ";#ASMEND"; - - //===--- Data Emission Directives -------------------------------------===// - ZeroDirective = ".zero"; - AsciiDirective = ".ascii\t"; - AscizDirective = ".asciz\t"; - Data8bitsDirective = ".byte\t"; - Data16bitsDirective = ".short\t"; - Data32bitsDirective = ".long\t"; - Data64bitsDirective = ".quad\t"; - SunStyleELFSectionSwitchSyntax = true; - UsesELFSectionDirectiveForBSS = true; - - //===--- Global Variable Emission Directives --------------------------===// - HasAggressiveSymbolFolding = true; - COMMDirectiveAlignmentIsInBytes = false; - HasDotTypeDotSizeDirective = false; - HasNoDeadStrip = true; - WeakRefDirective = ".weakref\t"; - //===--- Dwarf Emission Directives -----------------------------------===// - SupportsDebugInformation = true; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp (nonexistent) @@ -1,39 +0,0 @@ -//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCTargetDesc.h" -#include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCFixup.h" - -using namespace llvm; - -namespace { - -class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { -public: - AMDGPUELFObjectWriter(); -protected: - unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsPCRel) const override { - return Fixup.getKind(); - } - -}; - - -} // End anonymous namespace - -AMDGPUELFObjectWriter::AMDGPUELFObjectWriter() - : MCELFObjectTargetWriter(false, 0, 0, false) { } - -MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_pwrite_stream &OS) { - MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(); - return createELFObjectWriter(MOTW, OS, true); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h (nonexistent) @@ -1,50 +0,0 @@ -//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief CodeEmitter interface for R600 and SI codegen. 
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
-
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace llvm {
-
-class MCInst;
-class MCOperand;
-class MCSubtargetInfo;
-
-class AMDGPUMCCodeEmitter : public MCCodeEmitter {
-  virtual void anchor();
-public:
-
-  uint64_t getBinaryCodeForInstr(const MCInst &MI,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-
-  virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
-                                     SmallVectorImpl<MCFixup> &Fixups,
-                                     const MCSubtargetInfo &STI) const {
-    return 0;
-  }
-
-  virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
-                                     SmallVectorImpl<MCFixup> &Fixups,
-                                     const MCSubtargetInfo &STI) const {
-    return 0;
-  }
-};
-
-} // End namespace llvm
-
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h	(nonexistent)
@@ -1,32 +0,0 @@
-//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H
-#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H
-
-#include "llvm/MC/MCAsmInfoELF.h"
-namespace llvm {
-
-class Triple;
-
-// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo,
-// you will need to make sure your new class sets PrivateGlobalPrefix to
-// a prefix that won't appear in a function name. The default value
-// for PrivateGlobalPrefix is 'L', so it will consider any function starting
-// with 'L' as a local symbol.
-class AMDGPUMCAsmInfo : public MCAsmInfoELF {
-public:
-  explicit AMDGPUMCAsmInfo(const Triple &TT);
-};
-} // namespace llvm
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SISchedule.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/SISchedule.td	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/SISchedule.td	(nonexistent)
@@ -1,91 +0,0 @@
-//===-- SISchedule.td - SI Scheduling definitions -------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// MachineModel definitions for Southern Islands (SI)
-//
-//===----------------------------------------------------------------------===//
-
-def WriteBranch : SchedWrite;
-def WriteExport : SchedWrite;
-def WriteLDS    : SchedWrite;
-def WriteSALU   : SchedWrite;
-def WriteSMEM   : SchedWrite;
-def WriteVMEM   : SchedWrite;
-
-// Vector ALU instructions
-def Write32Bit         : SchedWrite;
-def WriteQuarterRate32 : SchedWrite;
-
-def WriteFloatFMA : SchedWrite;
-
-def WriteDouble    : SchedWrite;
-def WriteDoubleAdd : SchedWrite;
-
-def SIFullSpeedModel    : SchedMachineModel;
-def SIQuarterSpeedModel : SchedMachineModel;
-
-// BufferSize = 0 means the processors are in-order.
-let BufferSize = 0 in {
-
-// XXX: Are the resource counts correct?
-def HWBranch : ProcResource<1>;
-def HWExport : ProcResource<7>;  // Taken from S_WAITCNT
-def HWLGKM   : ProcResource<31>; // Taken from S_WAITCNT
-def HWSALU   : ProcResource<1>;
-def HWVMEM   : ProcResource<15>; // Taken from S_WAITCNT
-def HWVALU   : ProcResource<1>;
-
-}
-
-class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
-                 int latency> : WriteRes<write, resources> {
-  let Latency = latency;
-}
-
-class HWVALUWriteRes<SchedWrite write, int latency> :
-  HWWriteRes<write, [HWVALU], latency>;
-
-
-// The latency numbers are taken from AMD Accelerated Parallel Processing
-// guide. They may not be accurate.
-
-// The latency values are 1 / (operations / cycle) / 4.
-multiclass SICommonWriteRes {
-
-  def : HWWriteRes<WriteBranch,  [HWBranch], 100>; // XXX: Guessed ???
-  def : HWWriteRes<WriteExport,  [HWExport], 100>; // XXX: Guessed ???
-  def : HWWriteRes<WriteLDS,     [HWLGKM],   32>;  // 2 - 64
-  def : HWWriteRes<WriteSALU,    [HWSALU],   1>;
-  def : HWWriteRes<WriteSMEM,    [HWLGKM],   10>;  // XXX: Guessed ???
-  def : HWWriteRes<WriteVMEM,    [HWVMEM],   450>; // 300 - 600
-
-  def : HWVALUWriteRes<Write32Bit,         1>;
-  def : HWVALUWriteRes<WriteQuarterRate32, 4>;
}
-
-
-let SchedModel = SIFullSpeedModel in {
-
-defm : SICommonWriteRes;
-
-def : HWVALUWriteRes<WriteFloatFMA,  1>;
-def : HWVALUWriteRes<WriteDouble,    4>;
-def : HWVALUWriteRes<WriteDoubleAdd, 2>;
-
-} // End SchedModel = SIFullSpeedModel
-
-let SchedModel = SIQuarterSpeedModel in {
-
-defm : SICommonWriteRes;
-
-def : HWVALUWriteRes<WriteFloatFMA,  16>;
-def : HWVALUWriteRes<WriteDouble,    16>;
-def : HWVALUWriteRes<WriteDoubleAdd,  8>;
-
-} // End SchedModel = SIQuarterSpeedModel
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDKernelCodeT.h
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDKernelCodeT.h	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDKernelCodeT.h	(nonexistent)
@@ -1,704 +0,0 @@
-//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file AMDKernelCodeT.h
-//===----------------------------------------------------------------------===//
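To make the full-speed versus quarter-speed split above easier to compare at a glance, here is a standalone sketch tabulating the double/FMA latencies from the two models (values copied from the TableGen above; the struct and lookup are illustrative only, not part of LLVM):

// Illustrative tabulation of the VALU latencies from SISchedule.td above.
#include <cstdio>

struct ValuLatency {
  const char *Write;
  int FullSpeed;    // SIFullSpeedModel
  int QuarterSpeed; // SIQuarterSpeedModel
};

static const ValuLatency Table[] = {
  {"WriteFloatFMA",  1, 16},
  {"WriteDouble",    4, 16},
  {"WriteDoubleAdd", 2,  8},
};

int main() {
  for (const ValuLatency &L : Table)
    std::printf("%-14s full=%2d quarter=%2d\n", L.Write, L.FullSpeed,
                L.QuarterSpeed);
}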
-
-#ifndef AMDKERNELCODET_H
-#define AMDKERNELCODET_H
-
-#include <cstddef>
-#include <cstdint>
-
-//---------------------------------------------------------------------------//
-// AMD Kernel Code, and its dependencies                                      //
-//---------------------------------------------------------------------------//
-
-typedef uint8_t hsa_powertwo8_t;
-typedef uint32_t hsa_ext_code_kind_t;
-typedef uint8_t hsa_ext_brig_profile8_t;
-typedef uint8_t hsa_ext_brig_machine_model8_t;
-typedef uint64_t hsa_ext_control_directive_present64_t;
-typedef uint16_t hsa_ext_exception_kind16_t;
-typedef uint32_t hsa_ext_code_kind32_t;
-
-typedef struct hsa_dim3_s {
-  uint32_t x;
-  uint32_t y;
-  uint32_t z;
-} hsa_dim3_t;
-
-/// The version of the amd_*_code_t struct. Minor versions must be
-/// backward compatible.
-typedef uint32_t amd_code_version32_t;
-enum amd_code_version_t {
-  AMD_CODE_VERSION_MAJOR = 0,
-  AMD_CODE_VERSION_MINOR = 1
-};
-
-/// The values used to define the number of bytes to use for the
-/// swizzle element size.
-enum amd_element_byte_size_t {
-  AMD_ELEMENT_2_BYTES = 0,
-  AMD_ELEMENT_4_BYTES = 1,
-  AMD_ELEMENT_8_BYTES = 2,
-  AMD_ELEMENT_16_BYTES = 3
-};
-
-/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
-/// COMPUTE_PGM_RSRC2 registers.
-typedef uint64_t amd_compute_pgm_resource_register64_t;
-
-/// Every amd_*_code_t has the following properties, which are composed of
-/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
-/// bit width (AMD_CODE_PROPERTY_*_WIDTH), and bit shift amount
-/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.
-///
-/// (Note that bit fields cannot be used as their layout is
-/// implementation defined in the C standard and so cannot be used to
-/// specify an ABI)
-typedef uint32_t amd_code_property32_t;
-enum amd_code_property_mask_t {
-
-  /// Enable the setup of the SGPR user data registers
-  /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
-  /// for initial register state.
-  ///
-  /// The total number of SGPR user data registers requested must not
-  /// exceed 16. Any requests beyond 16 will be ignored.
-  ///
-  /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
-  /// SGPR user data registers enabled up to 16).
- - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, - - /// Control wave ID base counter for GDS ordered-append. Used to set - /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. 
(Not sure if
-  /// ORDERED_APPEND_MODE also needs to be settable)
-  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10,
-  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
-
-  /// The interleave (swizzle) element size in bytes required by the
-  /// code for private memory. This must be 2, 4, 8 or 16. This value
-  /// is provided to the finalizer when it is invoked and is recorded
-  /// here. The hardware will interleave the memory requests of each
-  /// lane of a wavefront by this element size to ensure each
-  /// work-item gets a distinct memory location. Therefore, the
-  /// finalizer ensures that all load and store operations done to
-  /// private memory do not exceed this size. For example, if the
-  /// element size is 4 (32-bits or dword) and a 64-bit value must be
-  /// loaded, the finalizer will generate two 32-bit loads. This
-  /// ensures that the interleaving will get the work-item
-  /// specific dword for both halves of the 64-bit value. If it just
-  /// did a 64-bit load then it would get one dword which belonged to
-  /// its own work-item, but the second dword would belong to the
-  /// adjacent lane work-item since the interleaving is in dwords.
-  ///
-  /// The value used must match the value that the runtime configures
-  /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
-  /// is generally DWORD.
-  ///
-  /// Use values from the amd_element_byte_size_t enum.
-  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11,
-  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
-  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
-
-  /// Are global memory addresses 64 bits? Must match
-  /// amd_kernel_code_t.hsail_machine_model ==
-  /// HSA_MACHINE_LARGE. Must also match
-  /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
-  /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
-  AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13,
-  AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
-  AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
-
-  /// Indicate if the generated ISA is using a dynamically sized call
-  /// stack. This can happen if calls are implemented using a call
-  /// stack and recursion, alloca or calls to indirect functions are
-  /// present. In these cases the Finalizer cannot compute the total
-  /// private segment size at compile time. In this case the
-  /// workitem_private_segment_byte_size only specifies the statically
-  /// known private segment size, and additional space must be added
-  /// for the call stack.
-  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14,
-  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
-  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
-
-  /// Indicate if code generated has support for debugging.
-  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15,
-  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
-  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT
-};
-
-/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL
-/// control directives. These control how the finalizer generates code.
This -/// struct is used both as an argument to hsaFinalizeKernel to specify values for -/// the control directives, and is used in HsaKernelCode to record the values of -/// the control directives that the finalize used when generating the code which -/// either came from the finalizer argument or explicit HSAIL control -/// directives. See the definition of the control directives in HSA Programmer's -/// Reference Manual which also defines how the values specified as finalizer -/// arguments have to agree with the control directives in the HSAIL code. -typedef struct hsa_ext_control_directives_s { - /// This is a bit set indicating which control directives have been - /// specified. If the value is 0 then there are no control directives specified - /// and the rest of the fields can be ignored. The bits are accessed using the - /// hsa_ext_control_directives_present_mask_t. Any control directive that is not - /// enabled in this bit set must have the value of all 0s. - hsa_ext_control_directive_present64_t enabled_control_directives; - - /// If enableBreakExceptions is not enabled then must be 0, otherwise must be - /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK - /// policy enabled. If this set is not empty then the generated code may have - /// lower performance than if the set is empty. If the kernel being finalized - /// has any enablebreakexceptions control directives, then the values specified - /// by this argument are unioned with the values in these control - /// directives. If any of the functions the kernel calls have an - /// enablebreakexceptions control directive, then they must be equal or a - /// subset of, this union. - hsa_ext_exception_kind16_t enable_break_exceptions; - - /// If enableDetectExceptions is not enabled then must be 0, otherwise must be - /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT - /// policy enabled. If this set is not empty then the generated code may have - /// lower performance than if the set is empty. However, an implementation - /// should endeavour to make the performance impact small. If the kernel being - /// finalized has any enabledetectexceptions control directives, then the - /// values specified by this argument are unioned with the values in these - /// control directives. If any of the functions the kernel calls have an - /// enabledetectexceptions control directive, then they must be equal or a - /// subset of, this union. - hsa_ext_exception_kind16_t enable_detect_exceptions; - - /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of - /// dynamic group segment can be allocated for a dispatch, otherwise the value - /// specifies the maximum number of bytes of dynamic group segment that can be - /// allocated for a dispatch. If the kernel being finalized has any - /// maxdynamicsize control directives, then the values must be the same, and - /// must be the same as this argument if it is enabled. This value can be used - /// by the finalizer to determine the maximum number of bytes of group memory - /// used by each work-group by adding this value to the group memory required - /// for all group segment variables used by the kernel and all functions it - /// calls, and group memory used to implement other HSAIL features such as - /// fbarriers and the detect exception operations. 
This can allow the finalizer - /// to determine the expected number of work-groups that can be executed by a - /// compute unit and allow more resources to be allocated to the work-items if - /// it is known that fewer work-groups can be executed due to group memory - /// limitations. - uint32_t max_dynamic_group_size; - - /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater - /// than 0. See HSA Programmer's Reference Manual description of - /// maxflatgridsize control directive. - uint32_t max_flat_grid_size; - - /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be - /// greater than 0. See HSA Programmer's Reference Manual description of - /// maxflatworkgroupsize control directive. - uint32_t max_flat_workgroup_size; - - /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the - /// finalizer is free to generate ISA that may result in any number of - /// work-groups executing on a single compute unit. Otherwise, the finalizer - /// should attempt to generate ISA that will allow the specified number of - /// work-groups to execute on a single compute unit. This is only a hint and - /// can be ignored by the finalizer. If the kernel being finalized, or any of - /// the functions it calls, has a requested control directive, then the values - /// must be the same. This can be used to determine the number of resources - /// that should be allocated to a single work-group and work-item. For example, - /// a low value may allow more resources to be allocated, resulting in higher - /// per work-item performance, as it is known there will never be more than the - /// specified number of work-groups actually executing on the compute - /// unit. Conversely, a high value may allocate fewer resources, resulting in - /// lower per work-item performance, which is offset by the fact it allows more - /// work-groups to actually execute on the compute unit. - uint32_t requested_workgroups_per_cu; - - /// If not enabled then all elements for Dim3 must be 0, otherwise every - /// element must be greater than 0. See HSA Programmer's Reference Manual - /// description of requiredgridsize control directive. - hsa_dim3_t required_grid_size; - - /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be - /// 0, and the produced code can be dispatched with any legal work-group range - /// consistent with the dispatch dimensions. Otherwise, the code produced must - /// always be dispatched with the specified work-group range. No element of the - /// specified range must be 0. It must be consistent with required_dimensions - /// and max_flat_workgroup_size. If the kernel being finalized, or any of the - /// functions it calls, has a requiredworkgroupsize control directive, then the - /// values must be the same. Specifying a value can allow the finalizer to - /// optimize work-group id operations, and if the number of work-items in the - /// work-group is less than the WAVESIZE then barrier operations can be - /// optimized to just a memory fence. - hsa_dim3_t required_workgroup_size; - - /// If requiredDim is not enabled then must be 0 and the produced kernel code - /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is - /// 1..3 and the code produced must only be dispatched with a dimension that - /// matches. Other values are illegal. If the kernel being finalized, or any of - /// the functions it calls, has a requireddimsize control directive, then the - /// values must be the same. 
This can be used to optimize the code generated to - /// compute the absolute and flat work-group and work-item id, and the dim - /// HSAIL operations. - uint8_t required_dim; - - /// Reserved. Must be 0. - uint8_t reserved[75]; -} hsa_ext_control_directives_t; - -/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel -/// Code Object to set up the hardware to execute the kernel dispatch. -/// -/// Initial Kernel Register State. -/// -/// Initial kernel register state will be set up by CP/SPI prior to the start -/// of execution of every wavefront. This is limited by the constraints of the -/// current hardware. -/// -/// The order of the SGPR registers is defined, but the Finalizer can specify -/// which ones are actually setup in the amd_kernel_code_t object using the -/// enable_sgpr_* bit fields. The register numbers used for enabled registers -/// are dense starting at SGPR0: the first enabled register is SGPR0, the next -/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR -/// number. -/// -/// The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and -/// apply to all waves of the grid. It is possible to specify more than 16 User -/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 -/// are actually initialized. These are then immediately followed by the System -/// SGPRs that are set up by ADC/SPI and can have different values for each wave -/// of the grid dispatch. -/// -/// SGPR register initial state is defined as follows: -/// -/// Private Segment Buffer (enable_sgpr_private_segment_buffer): -/// Number of User SGPR registers: 4. V# that can be used, together with -/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg -/// segments using a segment address. It must be set as follows: -/// - Base address: of the scratch memory area used by the dispatch. It -/// does not include the scratch wave offset. It will be the per process -/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for -/// example there may be a per pipe offset, or per AQL Queue offset). -/// - Stride + data_format: Element Size * Index Stride (???) -/// - Cache swizzle: ??? -/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for -/// scratch) -/// - Num records: Flat Scratch Work Item Size / Element Size (???) -/// - Dst_sel_*: ??? -/// - Num_format: ??? -/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must -/// agree with amd_kernel_code_t.privateElementSize) -/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must -/// be number of wavefront lanes for scratch, must agree with -/// amd_kernel_code_t.wavefrontSize) -/// - Add tid enable: 1 -/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, -/// - Hash_enable: ??? -/// - Heap: ??? -/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE -/// - Type: 0 (a buffer) (???) -/// -/// Dispatch Ptr (enable_sgpr_dispatch_ptr): -/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet -/// for kernel actually executing. -/// -/// Queue Ptr (enable_sgpr_queue_ptr): -/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for -/// AQL queue on which the dispatch packet was queued. -/// -/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): -/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This -/// is directly copied from the kernargPtr in the dispatch packet. Having CP -/// load it once avoids loading it at the beginning of every wavefront. 
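Returning briefly to the amd_code_property_mask_t SHIFT/WIDTH/MASK triples defined earlier: every field follows the same two-shifts-and-a-mask pattern. A standalone sketch (the helper names are hypothetical and not part of this header):

// Hypothetical helpers showing how the AMD_CODE_PROPERTY_*_SHIFT/_WIDTH/
// mask triples are meant to be used; not part of AMDKernelCodeT.h.
#include <cstdint>
#include <cstdio>

static uint32_t getField(uint32_t Props, uint32_t Shift, uint32_t Width) {
  return (Props >> Shift) & ((1u << Width) - 1u); // extract WIDTH bits at SHIFT
}

static uint32_t setField(uint32_t Props, uint32_t Shift, uint32_t Width,
                         uint32_t Val) {
  uint32_t Mask = ((1u << Width) - 1u) << Shift;  // same formula as the enum
  return (Props & ~Mask) | ((Val << Shift) & Mask);
}

int main() {
  // Shift/width mirror AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE above.
  uint32_t Props = 0;
  Props = setField(Props, 11, 2, 1);              // AMD_ELEMENT_4_BYTES
  std::printf("element size code = %u\n", getField(Props, 11, 2)); // 1
}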
-///
-/// Dispatch Id (enable_sgpr_dispatch_id):
-///   Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch
-///   packet being executed.
-///
-/// Flat Scratch Init (enable_sgpr_flat_scratch_init):
-///   Number of User SGPR registers: 2. This is 2 SGPRs.
-///
-///   For CI/VI:
-///     The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE
-///     to base of memory for scratch for this dispatch. This is the same offset
-///     used in computing the Scratch Segment Buffer base address. The value of
-///     Scratch Wave Offset must be added by the kernel code and moved to
-///     SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.
-///
-///     The second SGPR is 32 bit byte size of a single work-item's scratch
-///     memory usage. This is directly loaded from the dispatch packet Private
-///     Segment Byte Size and rounded up to a multiple of DWORD.
-///
-///     \todo [Does CP need to round this to >4 byte alignment?]
-///
-///     The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in
-///     flat memory instructions. Having CP load it once avoids loading it at
-///     the beginning of every wavefront.
-///
-///   For PI:
-///     This is the 64 bit base address of the scratch backing memory
-///     allocated by CP for this dispatch.
-///
-/// Private Segment Size (enable_sgpr_private_segment_size):
-///   Number of User SGPR registers: 1. The 32 bit byte size of a single
-///   work-item's scratch memory allocation. This is the value from the dispatch
-///   packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD.
-///
-///   \todo [Does CP need to round this to >4 byte alignment?]
-///
-///   Having CP load it once avoids loading it at the beginning of every
-///   wavefront.
-///
-///   \todo [This will not be used for CI/VI since it is the same value as
-///   the second SGPR of Flat Scratch Init. However, it is needed for PI which
-///   changes the meaning of Flat Scratch Init.]
-///
-/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x):
-///   Number of User SGPR registers: 1. 32 bit count of the number of
-///   work-groups in the X dimension for the grid being executed. Computed from
-///   the fields in the HsaDispatchPacket as
-///   ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).
-///
-/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y):
-///   Number of User SGPR registers: 1. 32 bit count of the number of
-///   work-groups in the Y dimension for the grid being executed. Computed from
-///   the fields in the HsaDispatchPacket as
-///   ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).
-///
-///   Only initialized if <16 previous SGPRs initialized.
-///
-/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z):
-///   Number of User SGPR registers: 1. 32 bit count of the number of
-///   work-groups in the Z dimension for the grid being executed. Computed
-///   from the fields in the HsaDispatchPacket as
-///   ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).
-///
-///   Only initialized if <16 previous SGPRs initialized.
-///
-/// Work-Group Id X (enable_sgpr_workgroup_id_x):
-///   Number of System SGPR registers: 1. 32 bit work group id in X dimension
-///   of grid for wavefront. Always present.
-///
-/// Work-Group Id Y (enable_sgpr_workgroup_id_y):
-///   Number of System SGPR registers: 1. 32 bit work group id in Y dimension
-///   of grid for wavefront.
-///
-/// Work-Group Id Z (enable_sgpr_workgroup_id_z):
-///   Number of System SGPR registers: 1. 32 bit work group id in Z dimension
-///   of grid for wavefront.
If present then Work-group Id Y will also be
-///   present.
-///
-/// Work-Group Info (enable_sgpr_workgroup_info):
-///   Number of System SGPR registers: 1. {first_wave, 14'b0000,
-///   ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}
-///
-/// Private Segment Wave Byte Offset
-/// (enable_sgpr_private_segment_wave_byte_offset):
-///   Number of System SGPR registers: 1. 32 bit byte offset from base of
-///   dispatch scratch base. Must be used as an offset with Private/Spill/Arg
-///   segment address when using Scratch Segment Buffer. It must be added to
-///   Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing.
-///
-///
-/// The order of the VGPR registers is defined, but the Finalizer can specify
-/// which ones are actually setup in the amd_kernel_code_t object using the
-/// enableVgpr* bit fields. The register numbers used for enabled registers
-/// are dense starting at VGPR0: the first enabled register is VGPR0, the next
-/// enabled register is VGPR1 etc.; disabled registers do not have a VGPR
-/// number.
-///
-/// VGPR register initial state is defined as follows:
-///
-/// Work-Item Id X (always initialized):
-///   Number of registers: 1. 32 bit work item id in X dimension of work-group
-///   for wavefront lane.
-///
-/// Work-Item Id Y (enable_vgpr_workitem_id > 0):
-///   Number of registers: 1. 32 bit work item id in Y dimension of work-group
-///   for wavefront lane.
-///
-/// Work-Item Id Z (enable_vgpr_workitem_id > 1):
-///   Number of registers: 1. 32 bit work item id in Z dimension of work-group
-///   for wavefront lane.
-///
-///
-/// The setting of registers is being done by existing GPU hardware as follows:
-/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data
-///    registers.
-/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any
-///    combination including none.
-/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot
-///    be added into the value Flat Scratch Offset which would avoid the
-///    Finalizer generated prolog having to do the add.
-/// 4) The VGPRs are set by SPI which only supports specifying either (X),
-///    (X, Y) or (X, Y, Z).
-///
-/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGPRs so
-/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and
-/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register.
-///
-/// The global segment can be accessed either using flat operations or buffer
-/// operations. If buffer operations are used then the Global Buffer used to
-/// access HSAIL Global/Readonly/Kernarg (which are combined) segments using a
-/// segment address is not passed into the kernel code by CP since its base
-/// address is always 0. Instead the Finalizer generates prolog code to
-/// initialize 4 SGPRs with a V# that has the following properties, and then
-/// uses that in the buffer instructions:
-///   - base address of 0
-///   - no swizzle
-///   - ATC=1
-///   - MTYPE set to support memory coherence specified in
-///     amd_kernel_code_t.globalMemoryCoherence
-///
-/// When the Global Buffer is used to access the Kernarg segment, must add the
-/// dispatch packet kernArgPtr to a kernarg segment address before using this V#.
-/// Alternatively scalar loads can be used if the kernarg offset is uniform, as
-/// the kernarg segment is constant for the duration of the kernel execution.
-///
-typedef struct amd_kernel_code_s {
-  /// The AMD major version of the Code Object. Must be the value
-  /// AMD_CODE_VERSION_MAJOR.
- amd_code_version32_t amd_code_version_major; - - /// The AMD minor version of the Code Object. Minor versions must be - /// backward compatible. Must be the value - /// AMD_CODE_VERSION_MINOR. - amd_code_version32_t amd_code_version_minor; - - /// The byte size of this struct. Must be set to - /// sizeof(amd_kernel_code_t). Used for backward - /// compatibility. - uint32_t struct_byte_size; - - /// The target chip instruction set for which code has been - /// generated. Values are from the E_SC_INSTRUCTION_SET enumeration - /// in sc/Interface/SCCommon.h. - uint32_t target_chip; - - /// Byte offset (possibly negative) from start of amd_kernel_code_t - /// object to kernel's entry point instruction. The actual code for - /// the kernel is required to be 256 byte aligned to match hardware - /// requirements (SQ cache line is 16). The code must be position - /// independent code (PIC) for AMD devices to give runtime the - /// option of copying code to discrete GPU memory or APU L2 - /// cache. The Finalizer should endeavour to allocate all kernel - /// machine code in contiguous memory pages so that a device - /// pre-fetcher will tend to only pre-fetch Kernel Code objects, - /// improving cache performance. - int64_t kernel_code_entry_byte_offset; - - /// Range of bytes to consider prefetching expressed as an offset - /// and size. The offset is from the start (possibly negative) of - /// amd_kernel_code_t object. Set both to 0 if no prefetch - /// information is available. - /// - /// \todo ttye 11/15/2013 Is the prefetch definition we want? Did - /// not make the size a uint64_t as prefetching more than 4GiB seems - /// excessive. - int64_t kernel_code_prefetch_byte_offset; - uint64_t kernel_code_prefetch_byte_size; - - /// Number of bytes of scratch backing memory required for full - /// occupancy of target chip. This takes into account the number of - /// bytes of scratch per work-item, the wavefront size, the maximum - /// number of wavefronts per CU, and the number of CUs. This is an - /// upper limit on scratch. If the grid being dispatched is small it - /// may only need less than this. If the kernel uses no scratch, or - /// the Finalizer has not computed this value, it must be 0. - uint64_t max_scratch_backing_memory_byte_size; - - /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and - /// COMPUTE_PGM_RSRC2 registers. - amd_compute_pgm_resource_register64_t compute_pgm_resource_registers; - - /// Code properties. See amd_code_property_mask_t for a full list of - /// properties. - amd_code_property32_t code_properties; - - /// The amount of memory required for the combined private, spill - /// and arg segments for a work-item in bytes. If - /// is_dynamic_callstack is 1 then additional space must be added to - /// this value for the call stack. - uint32_t workitem_private_segment_byte_size; - - /// The amount of group segment memory required by a work-group in - /// bytes. This does not include any dynamically allocated group - /// segment memory that may be added when the kernel is - /// dispatched. - uint32_t workgroup_group_segment_byte_size; - - /// Number of byte of GDS required by kernel dispatch. Must be 0 if - /// not using GDS. - uint32_t gds_segment_byte_size; - - /// The size in bytes of the kernarg segment that holds the values - /// of the arguments to the kernel. This could be used by CP to - /// prefetch the kernarg segment pointed to by the dispatch packet. 
- uint64_t kernarg_segment_byte_size; - - /// Number of fbarrier's used in the kernel and all functions it - /// calls. If the implementation uses group memory to allocate the - /// fbarriers then that amount must already be included in the - /// workgroup_group_segment_byte_size total. - uint32_t workgroup_fbarrier_count; - - /// Number of scalar registers used by a wavefront. This includes - /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size - /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a - /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS. - uint16_t wavefront_sgpr_count; - - /// Number of vector registers used by each work-item. Used to set - /// COMPUTE_PGM_RSRC1.VGPRS. - uint16_t workitem_vgpr_count; - - /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the - /// first fixed VGPR number reserved. - uint16_t reserved_vgpr_first; - - /// The number of consecutive VGPRs reserved by the client. If - /// is_debug_supported then this count includes VGPRs reserved - /// for debugger use. - uint16_t reserved_vgpr_count; - - /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the - /// first fixed SGPR number reserved. - uint16_t reserved_sgpr_first; - - /// The number of consecutive SGPRs reserved by the client. If - /// is_debug_supported then this count includes SGPRs reserved - /// for debugger use. - uint16_t reserved_sgpr_count; - - /// If is_debug_supported is 0 then must be 0. Otherwise, this is the - /// fixed SGPR number used to hold the wave scratch offset for the - /// entire kernel execution, or uint16_t(-1) if the register is not - /// used or not known. - uint16_t debug_wavefront_private_segment_offset_sgpr; - - /// If is_debug_supported is 0 then must be 0. Otherwise, this is the - /// fixed SGPR number of the first of 4 SGPRs used to hold the - /// scratch V# used for the entire kernel execution, or uint16_t(-1) - /// if the registers are not used or not known. - uint16_t debug_private_segment_buffer_sgpr; - - /// The maximum byte alignment of variables used by the kernel in - /// the specified memory segment. Expressed as a power of two. Must - /// be at least HSA_POWERTWO_16. - hsa_powertwo8_t kernarg_segment_alignment; - hsa_powertwo8_t group_segment_alignment; - hsa_powertwo8_t private_segment_alignment; - - uint8_t reserved3; - - /// Type of code object. - hsa_ext_code_kind32_t code_type; - - /// Reserved for code properties if any are defined in the future. - /// There are currently no code properties so this field must be 0. - uint32_t reserved4; - - /// Wavefront size expressed as a power of two. Must be a power of 2 - /// in range 1..64 inclusive. Used to support runtime query that - /// obtains wavefront size, which may be used by application to - /// allocated dynamic group memory and set the dispatch work-group - /// size. - hsa_powertwo8_t wavefront_size; - - /// The optimization level specified when the kernel was - /// finalized. - uint8_t optimization_level; - - /// The HSAIL profile defines which features are used. This - /// information is from the HSAIL version directive. If this - /// amd_kernel_code_t is not generated from an HSAIL compilation - /// unit then must be 0. - hsa_ext_brig_profile8_t hsail_profile; - - /// The HSAIL machine model gives the address sizes used by the - /// code. This information is from the HSAIL version directive. 
If
-/// not generated from an HSAIL compilation unit then must still
-/// indicate for what machine mode the code is generated.
-  hsa_ext_brig_machine_model8_t hsail_machine_model;
-
-  /// The HSAIL major version. This information is from the HSAIL
-  /// version directive. If this amd_kernel_code_t is not
-  /// generated from an HSAIL compilation unit then must be 0.
-  uint32_t hsail_version_major;
-
-  /// The HSAIL minor version. This information is from the HSAIL
-  /// version directive. If this amd_kernel_code_t is not
-  /// generated from an HSAIL compilation unit then must be 0.
-  uint32_t hsail_version_minor;
-
-  /// Reserved for HSAIL target options if any are defined in the
-  /// future. There are currently no target options so this field
-  /// must be 0.
-  uint16_t reserved5;
-
-  /// Reserved. Must be 0.
-  uint16_t reserved6;
-
-  /// The values should be the actual values used by the finalizer
-  /// in generating the code. This may be the union of values
-  /// specified as finalizer arguments and explicit HSAIL control
-  /// directives. If the finalizer chooses to ignore a control
-  /// directive, and not generate constrained code, then the control
-  /// directive should not be marked as enabled even though it was
-  /// present in the HSAIL or finalizer argument. The values are
-  /// intended to reflect the constraints that the code actually
-  /// requires to correctly execute, not the values that were
-  /// actually specified at finalize time.
-  hsa_ext_control_directives_t control_directive;
-
-  /// The code can immediately follow the amd_kernel_code_t, or can
-  /// come after subsequent amd_kernel_code_t structs when there are
-  /// multiple kernels in the compilation unit.
-
-} amd_kernel_code_t;
-
-#endif // AMDKERNELCODET_H

Property changes on: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDKernelCodeT.h
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp	(nonexistent)
@@ -1,303 +0,0 @@
-//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass translates tgsi-like texture intrinsics into R600 texture
-/// intrinsics that are closer to the hardware.
-//===----------------------------------------------------------------------===//
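As a rough illustration of the rewrite the pass below performs: shadow texture targets select the compare ("c") intrinsic variants. A minimal sketch of that selection (the helper is hypothetical; the names come from visitCallInst below):

// Hypothetical sketch of the intrinsic-name mapping performed by the pass
// below: shadow texture targets select the compare ("c") variants.
#include <cstdio>

static const char *mapTexIntrinsic(bool UseShadowVariant) {
  // Mirrors ReplaceTexIntrinsic(I, false, TexSign,
  //                             "llvm.R600.tex", "llvm.R600.texc") below.
  return UseShadowVariant ? "llvm.R600.texc" : "llvm.R600.tex";
}

int main() {
  std::printf("%s\n", mapTexIntrinsic(false)); // llvm.R600.tex
  std::printf("%s\n", mapTexIntrinsic(true));  // llvm.R600.texc
}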
-//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { -class R600TextureIntrinsicsReplacer : - public FunctionPass, public InstVisitor { - static char ID; - - Module *Mod; - Type *FloatType; - Type *Int32Type; - Type *V4f32Type; - Type *V4i32Type; - FunctionType *TexSign; - FunctionType *TexQSign; - - void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD, - unsigned SrcSelect[4], unsigned CT[4], - bool &useShadowVariant) { - enum TextureTypes { - TEXTURE_1D = 1, - TEXTURE_2D, - TEXTURE_3D, - TEXTURE_CUBE, - TEXTURE_RECT, - TEXTURE_SHADOW1D, - TEXTURE_SHADOW2D, - TEXTURE_SHADOWRECT, - TEXTURE_1D_ARRAY, - TEXTURE_2D_ARRAY, - TEXTURE_SHADOW1D_ARRAY, - TEXTURE_SHADOW2D_ARRAY, - TEXTURE_SHADOWCUBE, - TEXTURE_2D_MSAA, - TEXTURE_2D_ARRAY_MSAA, - TEXTURE_CUBE_ARRAY, - TEXTURE_SHADOWCUBE_ARRAY - }; - - switch (TextureType) { - case 0: - useShadowVariant = false; - return; - case TEXTURE_RECT: - case TEXTURE_1D: - case TEXTURE_2D: - case TEXTURE_3D: - case TEXTURE_CUBE: - case TEXTURE_1D_ARRAY: - case TEXTURE_2D_ARRAY: - case TEXTURE_CUBE_ARRAY: - case TEXTURE_2D_MSAA: - case TEXTURE_2D_ARRAY_MSAA: - useShadowVariant = false; - break; - case TEXTURE_SHADOW1D: - case TEXTURE_SHADOW2D: - case TEXTURE_SHADOWRECT: - case TEXTURE_SHADOW1D_ARRAY: - case TEXTURE_SHADOW2D_ARRAY: - case TEXTURE_SHADOWCUBE: - case TEXTURE_SHADOWCUBE_ARRAY: - useShadowVariant = true; - break; - default: - llvm_unreachable("Unknow Texture Type"); - } - - if (TextureType == TEXTURE_RECT || - TextureType == TEXTURE_SHADOWRECT) { - CT[0] = 0; - CT[1] = 0; - } - - if (TextureType == TEXTURE_CUBE_ARRAY || - TextureType == TEXTURE_SHADOWCUBE_ARRAY) - CT[2] = 0; - - if (TextureType == TEXTURE_1D_ARRAY || - TextureType == TEXTURE_SHADOW1D_ARRAY) { - if (hasLOD && useShadowVariant) { - CT[1] = 0; - } else { - CT[2] = 0; - SrcSelect[2] = 1; - } - } else if (TextureType == TEXTURE_2D_ARRAY || - TextureType == TEXTURE_SHADOW2D_ARRAY) { - CT[2] = 0; - } - - if ((TextureType == TEXTURE_SHADOW1D || - TextureType == TEXTURE_SHADOW2D || - TextureType == TEXTURE_SHADOWRECT || - TextureType == TEXTURE_SHADOW1D_ARRAY) && - !(hasLOD && useShadowVariant)) - SrcSelect[3] = 2; - } - - void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name, - unsigned SrcSelect[4], Value *Offset[3], Value *Resource, - Value *Sampler, unsigned CT[4], Value *Coord) { - IRBuilder<> Builder(&I); - Constant *Mask[] = { - ConstantInt::get(Int32Type, SrcSelect[0]), - ConstantInt::get(Int32Type, SrcSelect[1]), - ConstantInt::get(Int32Type, SrcSelect[2]), - ConstantInt::get(Int32Type, SrcSelect[3]) - }; - Value *SwizzleMask = ConstantVector::get(Mask); - Value *SwizzledCoord = - Builder.CreateShuffleVector(Coord, Coord, SwizzleMask); - - Value *Args[] = { - SwizzledCoord, - Offset[0], - Offset[1], - Offset[2], - Resource, - Sampler, - ConstantInt::get(Int32Type, CT[0]), - ConstantInt::get(Int32Type, CT[1]), - ConstantInt::get(Int32Type, CT[2]), - ConstantInt::get(Int32Type, CT[3]) - }; - - Function *F = Mod->getFunction(Name); - if (!F) { - F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod); - F->addFnAttr(Attribute::ReadNone); - } - I.replaceAllUsesWith(Builder.CreateCall(F, Args)); - I.eraseFromParent(); - } - - void ReplaceTexIntrinsic(CallInst 
-  void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT,
-                           const char *VanillaInt,
-                           const char *ShadowInt) {
-    Value *Coord = I.getArgOperand(0);
-    Value *ResourceId = I.getArgOperand(1);
-    Value *SamplerId = I.getArgOperand(2);
-
-    unsigned TextureType =
-        cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
-
-    unsigned SrcSelect[4] = { 0, 1, 2, 3 };
-    unsigned CT[4] = {1, 1, 1, 1};
-    Value *Offset[3] = {
-      ConstantInt::get(Int32Type, 0),
-      ConstantInt::get(Int32Type, 0),
-      ConstantInt::get(Int32Type, 0)
-    };
-    bool useShadowVariant;
-
-    getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
-                                   useShadowVariant);
-
-    ReplaceCallInst(I, FT, useShadowVariant ? ShadowInt : VanillaInt, SrcSelect,
-                    Offset, ResourceId, SamplerId, CT, Coord);
-  }
-
-  void ReplaceTXF(CallInst &I) {
-    Value *Coord = I.getArgOperand(0);
-    Value *ResourceId = I.getArgOperand(4);
-    Value *SamplerId = I.getArgOperand(5);
-
-    unsigned TextureType =
-        cast<ConstantInt>(I.getArgOperand(6))->getZExtValue();
-
-    unsigned SrcSelect[4] = { 0, 1, 2, 3 };
-    unsigned CT[4] = {1, 1, 1, 1};
-    Value *Offset[3] = {
-      I.getArgOperand(1),
-      I.getArgOperand(2),
-      I.getArgOperand(3),
-    };
-    bool useShadowVariant;
-
-    getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT,
-                                   useShadowVariant);
-
-    ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect,
-                    Offset, ResourceId, SamplerId, CT, Coord);
-  }
-
-public:
-  R600TextureIntrinsicsReplacer():
-    FunctionPass(ID) {
-  }
-
-  bool doInitialization(Module &M) override {
-    LLVMContext &Ctx = M.getContext();
-    Mod = &M;
-    FloatType = Type::getFloatTy(Ctx);
-    Int32Type = Type::getInt32Ty(Ctx);
-    V4f32Type = VectorType::get(FloatType, 4);
-    V4i32Type = VectorType::get(Int32Type, 4);
-    Type *ArgsType[] = {
-      V4f32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-    };
-    TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false);
-    Type *ArgsQType[] = {
-      V4i32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-      Int32Type,
-    };
-    TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false);
-    return false;
-  }
-
-  bool runOnFunction(Function &F) override {
-    visit(F);
-    return false;
-  }
-
-  const char *getPassName() const override {
-    return "R600 Texture Intrinsics Replacer";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-  }
-
-  void visitCallInst(CallInst &I) {
-    if (!I.getCalledFunction())
-      return;
-
-    StringRef Name = I.getCalledFunction()->getName();
-    if (Name == "llvm.AMDGPU.tex") {
-      ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc");
-      return;
-    }
-    if (Name == "llvm.AMDGPU.txl") {
-      ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc");
-      return;
-    }
-    if (Name == "llvm.AMDGPU.txb") {
-      ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc");
-      return;
-    }
-    if (Name == "llvm.AMDGPU.txf") {
-      ReplaceTXF(I);
-      return;
-    }
-    if (Name == "llvm.AMDGPU.txq") {
-      ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq");
-      return;
-    }
-    if (Name == "llvm.AMDGPU.ddx") {
-      ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx");
-      return;
-    }
-    if (Name == "llvm.AMDGPU.ddy") {
-      ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy");
-      return;
-    }
-  }
-
-};
-
-char R600TextureIntrinsicsReplacer::ID = 0;
-
-}
-
-FunctionPass *llvm::createR600TextureIntrinsicsReplacer() {
-  return new R600TextureIntrinsicsReplacer();
-}
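The visitCallInst dispatch just deleted is a fixed name-to-name rewrite. The same mapping expressed as data, with names copied from the pass (the shadow column is what gets substituted when useShadowVariant is set; llvm.AMDGPU.txf goes through the separate ReplaceTXF path):

#include <cstdio>
#include <cstring>

struct TexMapping { const char *From, *To, *ShadowTo; bool HasLOD; };

static const TexMapping Table[] = {
    {"llvm.AMDGPU.tex", "llvm.R600.tex", "llvm.R600.texc", false},
    {"llvm.AMDGPU.txl", "llvm.R600.txl", "llvm.R600.txlc", true},
    {"llvm.AMDGPU.txb", "llvm.R600.txb", "llvm.R600.txbc", true},
    {"llvm.AMDGPU.txq", "llvm.R600.txq", "llvm.R600.txq", false},
    {"llvm.AMDGPU.ddx", "llvm.R600.ddx", "llvm.R600.ddx", false},
    {"llvm.AMDGPU.ddy", "llvm.R600.ddy", "llvm.R600.ddy", false},
};

// Returns the replacement intrinsic name, or null for unknown callees.
const char *lookupReplacement(const char *Name, bool Shadow) {
  for (const TexMapping &M : Table)
    if (std::strcmp(Name, M.From) == 0)
      return Shadow ? M.ShadowTo : M.To;
  return nullptr;
}

int main() {
  std::printf("%s\n", lookupReplacement("llvm.AMDGPU.tex", /*Shadow=*/true));
}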
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrInfo.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrInfo.td (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrInfo.td (nonexistent)
@@ -1,2605 +0,0 @@
-//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-def isCI : Predicate<"Subtarget->getGeneration() "
-                     ">= AMDGPUSubtarget::SEA_ISLANDS">;
-def isVI : Predicate <
-  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
-  AssemblerPredicate<"FeatureGCN3Encoding">;
-
-def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
-
-class vop {
-  field bits<9> SI3;
-  field bits<10> VI3;
-}
-
-class vopc <bits<8> si, bits<8> vi = !add(0x40, si)> : vop {
-  field bits<8> SI = si;
-  field bits<8> VI = vi;
-
-  field bits<9> SI3 = {0, si{7-0}};
-  field bits<10> VI3 = {0, 0, vi{7-0}};
-}
-
-class vop1 <bits<8> si, bits<8> vi = si> : vop {
-  field bits<8> SI = si;
-  field bits<8> VI = vi;
-
-  field bits<9> SI3 = {1, 1, si{6-0}};
-  field bits<10> VI3 = !add(0x140, vi);
-}
-
-class vop2 <bits<6> si, bits<6> vi = si> : vop {
-  field bits<6> SI = si;
-  field bits<6> VI = vi;
-
-  field bits<9> SI3 = {1, 0, 0, si{5-0}};
-  field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}};
-}
-
-// Specify a VOP2 opcode for SI and VOP3 opcode for VI
-// that doesn't have VOP2 encoding on VI
-class vop23 <bits<6> si, bits<10> vi> : vop2 <si> {
-  let VI3 = vi;
-}
-
-class vop3 <bits<9> si, bits<10> vi = {0, si}> : vop {
-  let SI3 = si;
-  let VI3 = vi;
-}
-
-class sop1 <bits<8> si, bits<8> vi = si> {
-  field bits<8> SI = si;
-  field bits<8> VI = vi;
-}
-
-class sop2 <bits<7> si, bits<7> vi = si> {
-  field bits<7> SI = si;
-  field bits<7> VI = vi;
-}
-
-class sopk <bits<5> si, bits<5> vi = si> {
-  field bits<5> SI = si;
-  field bits<5> VI = vi;
-}
-
-// Except for the NONE field, this must be kept in sync with the SISubtarget enum
-// in AMDGPUInstrInfo.cpp
-def SISubtarget {
-  int NONE = -1;
-  int SI = 0;
-  int VI = 1;
-}
-
-//===----------------------------------------------------------------------===//
-// SI DAG Nodes
-//===----------------------------------------------------------------------===//
-
-def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
-  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
-  [SDNPMayLoad, SDNPMemOperand]
->;
-
-def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
-  SDTypeProfile<0, 13,
-    [SDTCisVT<0, v4i32>,  // rsrc(SGPR)
-     SDTCisVT<1, iAny>,   // vdata(VGPR)
-     SDTCisVT<2, i32>,    // num_channels(imm)
-     SDTCisVT<3, i32>,    // vaddr(VGPR)
-     SDTCisVT<4, i32>,    // soffset(SGPR)
-     SDTCisVT<5, i32>,    // inst_offset(imm)
-     SDTCisVT<6, i32>,    // dfmt(imm)
-     SDTCisVT<7, i32>,    // nfmt(imm)
-     SDTCisVT<8, i32>,    // offen(imm)
-     SDTCisVT<9, i32>,    // idxen(imm)
-     SDTCisVT<10, i32>,   // glc(imm)
-     SDTCisVT<11, i32>,   // slc(imm)
-     SDTCisVT<12, i32>    // tfe(imm)
-    ]>,
-  [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
-def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
-  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
-                       SDTCisVT<3, i32>]>
->;
-
-class SDSample<string opcode> : SDNode <opcode,
-  SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>,
-                       SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
->;
-
-def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
-def 
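The vop helper classes above derive each instruction's VOP3 ("_e64") opcode from its basic one: vopc defaults its VI opcode to si + 0x40, vop1 forms its VI VOP3 opcode as 0x140 + vi, and its SI VOP3 opcode sets the two bits above the low seven. A runnable restatement of that arithmetic (function names are invented for the sketch):

#include <cstdint>
#include <cstdio>

// vopc: field bits<8> VI = !add(0x40, si) by default.
uint8_t vopcViOpcode(uint8_t Si) { return Si + 0x40; }

// vop1: field bits<10> VI3 = !add(0x140, vi).
uint16_t vop1Vi3Opcode(uint8_t Vi) { return 0x140 + Vi; }

// vop1: field bits<9> SI3 = {1, 1, si{6-0}}.
uint16_t vop1Si3Opcode(uint8_t Si) { return (0x3u << 7) | (Si & 0x7f); }

int main() {
  std::printf("%#x %#x %#x\n", vopcViOpcode(0x02), vop1Vi3Opcode(0x01),
              vop1Si3Opcode(0x01));
}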
SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; -def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; -def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; - -def SIconstdata_ptr : SDNode< - "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> ->; - -//===----------------------------------------------------------------------===// -// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 -// to be glued to the memory instructions. -//===----------------------------------------------------------------------===// - -def SIld_local : SDNode <"ISD::LOAD", SDTLoad, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] ->; - -def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ - return isLocalLoad(cast(N)); -}]>; - -def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ - return cast(N)->getAddressingMode() == ISD::UNINDEXED && - cast(N)->getExtensionType() == ISD::NON_EXTLOAD; -}]>; - -def si_load_local_align8 : Aligned8Bytes < - (ops node:$ptr), (si_load_local node:$ptr) ->; - -def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ - return cast(N)->getExtensionType() == ISD::SEXTLOAD; -}]>; -def si_az_extload_local : AZExtLoadBase ; - -multiclass SIExtLoadLocal { - - def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), - [{return cast(N)->getMemoryVT() == MVT::i8;}] - >; - - def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), - [{return cast(N)->getMemoryVT() == MVT::i16;}] - >; -} - -defm si_sextload_local : SIExtLoadLocal ; -defm si_az_extload_local : SIExtLoadLocal ; - -def SIst_local : SDNode <"ISD::STORE", SDTStore, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] ->; - -def si_st_local : PatFrag < - (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ - return isLocalStore(cast(N)); -}]>; - -def si_store_local : PatFrag < - (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ - return cast(N)->getAddressingMode() == ISD::UNINDEXED && - !cast(N)->isTruncatingStore(); -}]>; - -def si_store_local_align8 : Aligned8Bytes < - (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr) ->; - -def si_truncstore_local : PatFrag < - (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ - return cast(N)->isTruncatingStore(); -}]>; - -def si_truncstore_local_i8 : PatFrag < - (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def si_truncstore_local_i16 : PatFrag < - (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -multiclass SIAtomicM0Glue2 { - - def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] - >; - - def _local : local_binary_atomic_op (NAME#"_glue")>; -} - -defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; -defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; -defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; -defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; -defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; -defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; -defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; -defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; -defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; -defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">; - -def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, - [SDNPHasChain, 
SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] ->; - -defm si_atomic_cmp_swap : AtomicCmpSwapLocal ; - -// Transformation function, extract the lower 32bit of a 64bit immediate -def LO32 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N), - MVT::i32); -}]>; - -def LO32f : SDNodeXFormgetValueAPF().bitcastToAPInt().trunc(32); - return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32); -}]>; - -// Transformation function, extract the upper 32bit of a 64bit immediate -def HI32 : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 32, SDLoc(N), MVT::i32); -}]>; - -def HI32f : SDNodeXFormgetValueAPF().bitcastToAPInt().lshr(32).trunc(32); - return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), SDLoc(N), - MVT::f32); -}]>; - -def IMM8bitDWORD : PatLeaf <(imm), - [{return (N->getZExtValue() & ~0x3FC) == 0;}] ->; - -def as_dword_i32imm : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 2, SDLoc(N), MVT::i32); -}]>; - -def as_i1imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); -}]>; - -def as_i8imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8); -}]>; - -def as_i16imm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); -}]>; - -def as_i32imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); -}]>; - -def as_i64imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); -}]>; - -// Copied from the AArch64 backend: -def bitcast_fpimm_to_i32 : SDNodeXFormgetTargetConstant( - N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); -}]>; - -// Copied from the AArch64 backend: -def bitcast_fpimm_to_i64 : SDNodeXFormgetTargetConstant( - N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); -}]>; - -def IMM8bit : PatLeaf <(imm), - [{return isUInt<8>(N->getZExtValue());}] ->; - -def IMM12bit : PatLeaf <(imm), - [{return isUInt<12>(N->getZExtValue());}] ->; - -def IMM16bit : PatLeaf <(imm), - [{return isUInt<16>(N->getZExtValue());}] ->; - -def IMM20bit : PatLeaf <(imm), - [{return isUInt<20>(N->getZExtValue());}] ->; - -def IMM32bit : PatLeaf <(imm), - [{return isUInt<32>(N->getZExtValue());}] ->; - -def mubuf_vaddr_offset : PatFrag< - (ops node:$ptr, node:$offset, node:$imm_offset), - (add (add node:$ptr, node:$offset), node:$imm_offset) ->; - -class InlineImm : PatLeaf <(vt imm), [{ - return isInlineImmediate(N); -}]>; - -class InlineFPImm : PatLeaf <(vt fpimm), [{ - return isInlineImmediate(N); -}]>; - -class SGPRImm : PatLeafgetGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return false; - } - const SIRegisterInfo *SIRI = - static_cast(Subtarget->getRegisterInfo()); - for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); - U != E; ++U) { - if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { - return true; - } - } - return false; -}]>; - -//===----------------------------------------------------------------------===// -// Custom Operands -//===----------------------------------------------------------------------===// - -def FRAMEri32 : Operand { - let MIOperandInfo = (ops i32:$ptr, i32imm:$index); -} - -def SoppBrTarget : AsmOperandClass { - let Name = "SoppBrTarget"; - let ParserMethod = "parseSOppBrTarget"; -} - -def sopp_brtarget : Operand { - let EncoderMethod = "getSOPPBrEncoding"; - let OperandType = "OPERAND_PCREL"; - let ParserMatchClass = SoppBrTarget; -} - -include "SIInstrFormats.td" -include "VIInstrFormats.td" - -def MubufOffsetMatchClass : AsmOperandClass { - let 
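The immediate transforms in this hunk (LO32, HI32, the fp-bitcast forms, and the IMM8bitDWORD/as_dword pair) are plain bit manipulation. Restated in C++ so the masks and shifts can be unit-tested (function names are invented for the sketch):

#include <cstdint>
#include <cstring>
#include <cstdio>

// LO32/HI32: split a 64-bit immediate into its 32-bit halves.
uint32_t lo32(uint64_t Imm) { return uint32_t(Imm & 0xffffffffu); }
uint32_t hi32(uint64_t Imm) { return uint32_t(Imm >> 32); }

// bitcast_fpimm_to_i32: reinterpret an f32 immediate's bits as i32.
uint32_t bitcastFpImm(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

// IMM8bitDWORD accepts immediates whose only set bits lie in [2,9],
// i.e. a byte-sized value pre-scaled by 4; as_dword_i32imm then shifts
// the dword offset back down.
bool isImm8bitDword(uint64_t Imm) { return (Imm & ~0x3FCull) == 0; }
uint32_t asDwordImm(uint64_t Imm) { return uint32_t(Imm >> 2); }

int main() {
  uint64_t I = 0x123456789abcdef0ull;
  std::printf("%x %x %x %d %u\n", lo32(I), hi32(I), bitcastFpImm(1.0f),
              isImm8bitDword(0x3FC), asDwordImm(0x3FC));
}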
Name = "MubufOffset"; - let ParserMethod = "parseMubufOptionalOps"; - let RenderMethod = "addImmOperands"; -} - -class DSOffsetBaseMatchClass : AsmOperandClass { - let Name = "DSOffset"#parser; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; - let PredicateMethod = "isDSOffset"; -} - -def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">; -def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">; - -def DSOffset01MatchClass : AsmOperandClass { - let Name = "DSOffset1"; - let ParserMethod = "parseDSOff01OptionalOps"; - let RenderMethod = "addImmOperands"; - let PredicateMethod = "isDSOffset01"; -} - -class GDSBaseMatchClass : AsmOperandClass { - let Name = "GDS"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; -} - -def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">; -def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; - -def GLCMatchClass : AsmOperandClass { - let Name = "GLC"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseMubufOptionalOps"; - let RenderMethod = "addImmOperands"; -} - -def SLCMatchClass : AsmOperandClass { - let Name = "SLC"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseMubufOptionalOps"; - let RenderMethod = "addImmOperands"; -} - -def TFEMatchClass : AsmOperandClass { - let Name = "TFE"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseMubufOptionalOps"; - let RenderMethod = "addImmOperands"; -} - -def OModMatchClass : AsmOperandClass { - let Name = "OMod"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseVOP3OptionalOps"; - let RenderMethod = "addImmOperands"; -} - -def ClampMatchClass : AsmOperandClass { - let Name = "Clamp"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseVOP3OptionalOps"; - let RenderMethod = "addImmOperands"; -} - -let OperandType = "OPERAND_IMMEDIATE" in { - -def offen : Operand { - let PrintMethod = "printOffen"; -} -def idxen : Operand { - let PrintMethod = "printIdxen"; -} -def addr64 : Operand { - let PrintMethod = "printAddr64"; -} -def mbuf_offset : Operand { - let PrintMethod = "printMBUFOffset"; - let ParserMatchClass = MubufOffsetMatchClass; -} -class ds_offset_base : Operand { - let PrintMethod = "printDSOffset"; - let ParserMatchClass = mc; -} -def ds_offset : ds_offset_base ; -def ds_offset_gds : ds_offset_base ; - -def ds_offset0 : Operand { - let PrintMethod = "printDSOffset0"; - let ParserMatchClass = DSOffset01MatchClass; -} -def ds_offset1 : Operand { - let PrintMethod = "printDSOffset1"; - let ParserMatchClass = DSOffset01MatchClass; -} -class gds_base : Operand { - let PrintMethod = "printGDS"; - let ParserMatchClass = mc; -} -def gds : gds_base ; - -def gds01 : gds_base ; - -def glc : Operand { - let PrintMethod = "printGLC"; - let ParserMatchClass = GLCMatchClass; -} -def slc : Operand { - let PrintMethod = "printSLC"; - let ParserMatchClass = SLCMatchClass; -} -def tfe : Operand { - let PrintMethod = "printTFE"; - let ParserMatchClass = TFEMatchClass; -} - -def omod : Operand { - let PrintMethod = "printOModSI"; - let ParserMatchClass = OModMatchClass; -} - -def ClampMod : Operand { - let PrintMethod = "printClampSI"; - let ParserMatchClass = ClampMatchClass; -} - -} // End OperandType = "OPERAND_IMMEDIATE" - -def VOPDstS64 : VOPDstOperand ; - -//===----------------------------------------------------------------------===// -// Complex patterns 
-//===----------------------------------------------------------------------===// - -def DS1Addr1Offset : ComplexPattern; -def DS64Bit4ByteAligned : ComplexPattern; - -def MUBUFAddr32 : ComplexPattern; -def MUBUFAddr64 : ComplexPattern; -def MUBUFAddr64Atomic : ComplexPattern; -def MUBUFScratch : ComplexPattern; -def MUBUFOffset : ComplexPattern; -def MUBUFOffsetAtomic : ComplexPattern; - -def VOP3Mods0 : ComplexPattern; -def VOP3Mods0Clamp : ComplexPattern; -def VOP3Mods0Clamp0OMod : ComplexPattern; -def VOP3Mods : ComplexPattern; - -//===----------------------------------------------------------------------===// -// SI assembler operands -//===----------------------------------------------------------------------===// - -def SIOperand { - int ZERO = 0x80; - int VCC = 0x6A; - int FLAT_SCR = 0x68; -} - -def SRCMODS { - int NONE = 0; - int NEG = 1; -} - -def DSTCLAMP { - int NONE = 0; -} - -def DSTOMOD { - int NONE = 0; -} - -//===----------------------------------------------------------------------===// -// -// SI Instruction multiclass helpers. -// -// Instructions with _32 take 32-bit operands. -// Instructions with _64 take 64-bit operands. -// -// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit -// encoding is the standard encoding, but instruction that make use of -// any of the instruction modifiers must use the 64-bit encoding. -// -// Instructions with _e32 use the 32-bit encoding. -// Instructions with _e64 use the 64-bit encoding. -// -//===----------------------------------------------------------------------===// - -class SIMCInstr { - string PseudoInstr = pseudo; - int Subtarget = subtarget; -} - -//===----------------------------------------------------------------------===// -// EXP classes -//===----------------------------------------------------------------------===// - -class EXPCommon : InstSI< - (outs), - (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, - VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3), - "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", - [] > { - - let EXP_CNT = 1; - let Uses = [EXEC]; -} - -multiclass EXP_m { - - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; - } - - def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; - - def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi; -} - -//===----------------------------------------------------------------------===// -// Scalar classes -//===----------------------------------------------------------------------===// - -class SOP1_Pseudo pattern> : - SOP1 , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class SOP1_Real_si : - SOP1 , - SOP1e , - SIMCInstr { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isSICI]; -} - -class SOP1_Real_vi : - SOP1 , - SOP1e , - SIMCInstr { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isVI]; -} - -multiclass SOP1_m pattern> { - - def "" : SOP1_Pseudo ; - - def _si : SOP1_Real_si ; - - def _vi : SOP1_Real_vi ; - -} - -multiclass SOP1_32 pattern> : SOP1_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), - opName#" $dst, $src0", pattern ->; - -multiclass SOP1_64 pattern> : SOP1_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern ->; - -// no input, 64-bit output. 
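Every pseudo defined through SIMCInstr above carries a PseudoInstr string plus one of the SISubtarget ids (NONE/SI/VI), and the MC lowering later picks the matching _si or _vi encoding. A toy version of that selection (the table entries are hypothetical names, not the real opcode set):

#include <cstdio>
#include <cstring>

enum SISubtargetId { NONE = -1, SI = 0, VI = 1 }; // mirrors def SISubtarget

struct MCMapping { const char *Pseudo; SISubtargetId ST; const char *Real; };

static const MCMapping Map[] = {
    {"exp", SI, "exp_si"}, // hypothetical encoded names
    {"exp", VI, "exp_vi"},
};

// Pick the subtarget-specific encoding for a pseudo, or null if none.
const char *selectEncoding(const char *Pseudo, SISubtargetId ST) {
  for (const MCMapping &M : Map)
    if (M.ST == ST && std::strcmp(M.Pseudo, Pseudo) == 0)
      return M.Real;
  return nullptr;
}

int main() { std::printf("%s\n", selectEncoding("exp", VI)); }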
-multiclass SOP1_64_0 pattern> { - def "" : SOP1_Pseudo ; - - def _si : SOP1_Real_si { - let ssrc0 = 0; - } - - def _vi : SOP1_Real_vi { - let ssrc0 = 0; - } -} - -// 64-bit input, no output -multiclass SOP1_1 pattern> { - def "" : SOP1_Pseudo ; - - def _si : SOP1_Real_si { - let sdst = 0; - } - - def _vi : SOP1_Real_vi { - let sdst = 0; - } -} - -// 64-bit input, 32-bit output. -multiclass SOP1_32_64 pattern> : SOP1_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern ->; - -class SOP2_Pseudo pattern> : - SOP2, - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; - let Size = 4; - - // Pseudo instructions have no encodings, but adding this field here allows - // us to do: - // let sdst = xxx in { - // for multiclasses that include both real and pseudo instructions. - field bits<7> sdst = 0; -} - -class SOP2_Real_si : - SOP2, - SOP2e, - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class SOP2_Real_vi : - SOP2, - SOP2e, - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass SOP2_SELECT_32 pattern> { - def "" : SOP2_Pseudo ; - - def _si : SOP2_Real_si ; - - def _vi : SOP2_Real_vi ; -} - -multiclass SOP2_m pattern> { - - def "" : SOP2_Pseudo ; - - def _si : SOP2_Real_si ; - - def _vi : SOP2_Real_vi ; - -} - -multiclass SOP2_32 pattern> : SOP2_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; - -multiclass SOP2_64 pattern> : SOP2_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), - opName#" $dst, $src0, $src1", pattern ->; - -multiclass SOP2_64_32 pattern> : SOP2_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; - -class SOPC_Helper op, RegisterOperand rc, ValueType vt, - string opName, PatLeaf cond> : SOPC < - op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), - opName#" $src0, $src1", []>; - -class SOPC_32 op, string opName, PatLeaf cond = COND_NULL> - : SOPC_Helper; - -class SOPC_64 op, string opName, PatLeaf cond = COND_NULL> - : SOPC_Helper; - -class SOPK_Pseudo pattern> : - SOPK , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class SOPK_Real_si : - SOPK , - SOPKe , - SIMCInstr { - let AssemblerPredicates = [isSICI]; - let isCodeGenOnly = 0; -} - -class SOPK_Real_vi : - SOPK , - SOPKe , - SIMCInstr { - let AssemblerPredicates = [isVI]; - let isCodeGenOnly = 0; -} - -multiclass SOPK_m { - def "" : SOPK_Pseudo ; - - def _si : SOPK_Real_si ; - - def _vi : SOPK_Real_vi ; - -} - -multiclass SOPK_32 pattern> { - def "" : SOPK_Pseudo ; - - def _si : SOPK_Real_si ; - - def _vi : SOPK_Real_vi ; -} - -multiclass SOPK_SCC pattern> { - def "" : SOPK_Pseudo ; - - let DisableEncoding = "$dst" in { - def _si : SOPK_Real_si ; - - def _vi : SOPK_Real_vi ; - } -} - -multiclass SOPK_32TIE pattern> : SOPK_m < - op, opName, (outs SReg_32:$sdst), (ins SReg_32:$src0, u16imm:$simm16), - " $sdst, $simm16" ->; - -multiclass SOPK_IMM32 { - - def "" : SOPK_Pseudo ; - - def _si : SOPK , - SOPK64e , - SIMCInstr { - let AssemblerPredicates = [isSICI]; - let isCodeGenOnly = 0; - } - - def _vi : SOPK , - SOPK64e , - SIMCInstr { - let AssemblerPredicates = [isVI]; - let isCodeGenOnly = 0; - } -} -//===----------------------------------------------------------------------===// -// SMRD classes -//===----------------------------------------------------------------------===// - -class SMRD_Pseudo pattern> : - SMRD , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - 
-class SMRD_Real_si op, string opName, bit imm, dag outs, dag ins, - string asm> : - SMRD , - SMRDe , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class SMRD_Real_vi op, string opName, bit imm, dag outs, dag ins, - string asm> : - SMRD , - SMEMe_vi , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass SMRD_m op, string opName, bit imm, dag outs, dag ins, - string asm, list pattern> { - - def "" : SMRD_Pseudo ; - - def _si : SMRD_Real_si ; - - // glc is only applicable to scalar stores, which are not yet - // implemented. - let glc = 0 in { - def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; - } -} - -multiclass SMRD_Helper op, string opName, RegisterClass baseClass, - RegisterClass dstClass> { - defm _IMM : SMRD_m < - op, opName#"_IMM", 1, (outs dstClass:$dst), - (ins baseClass:$sbase, u32imm:$offset), - opName#" $dst, $sbase, $offset", [] - >; - - defm _SGPR : SMRD_m < - op, opName#"_SGPR", 0, (outs dstClass:$dst), - (ins baseClass:$sbase, SReg_32:$soff), - opName#" $dst, $sbase, $soff", [] - >; -} - -//===----------------------------------------------------------------------===// -// Vector ALU classes -//===----------------------------------------------------------------------===// - -// This must always be right before the operand being input modified. -def InputMods : OperandWithDefaultOps { - let PrintMethod = "printOperandAndMods"; -} - -def InputModsMatchClass : AsmOperandClass { - let Name = "RegWithInputMods"; -} - -def InputModsNoDefault : Operand { - let PrintMethod = "printOperandAndMods"; - let ParserMatchClass = InputModsMatchClass; -} - -class getNumSrcArgs { - int ret = - !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 - !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 - 3)); // VOP3 -} - -// Returns the register class to use for the destination of VOP[123C] -// instructions for the given VT. -class getVALUDstForVT { - RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand, - !if(!eq(VT.Size, 64), VOPDstOperand, - VOPDstOperand)); // else VT == i1 -} - -// Returns the register class to use for source 0 of VOP[12C] -// instructions for the given VT. -class getVOPSrc0ForVT { - RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); -} - -// Returns the register class to use for source 1 of VOP[12C] for the -// given VT. -class getVOPSrc1ForVT { - RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); -} - -// Returns the register class to use for sources of VOP3 instructions for the -// given VT. -class getVOP3SrcForVT { - RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); -} - -// Returns 1 if the source arguments have modifiers, 0 if they do not. -class hasModifiers { - bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, - !if(!eq(SrcVT.Value, f64.Value), 1, 0)); -} - -// Returns the input arguments for VOP[12C] instructions for the given SrcVT. -class getIns32 { - dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 - !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 - (ins))); -} - -// Returns the input arguments for VOP3 instructions for the given SrcVT. 
-class getIns64 { - - dag ret = - !if (!eq(NumSrcArgs, 1), - !if (!eq(HasModifiers, 1), - // VOP1 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - ClampMod:$clamp, omod:$omod) - /* else */, - // VOP1 without modifiers - (ins Src0RC:$src0) - /* endif */ ), - !if (!eq(NumSrcArgs, 2), - !if (!eq(HasModifiers, 1), - // VOP 2 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - InputModsNoDefault:$src1_modifiers, Src1RC:$src1, - ClampMod:$clamp, omod:$omod) - /* else */, - // VOP2 without modifiers - (ins Src0RC:$src0, Src1RC:$src1) - /* endif */ ) - /* NumSrcArgs == 3 */, - !if (!eq(HasModifiers, 1), - // VOP3 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - InputModsNoDefault:$src1_modifiers, Src1RC:$src1, - InputModsNoDefault:$src2_modifiers, Src2RC:$src2, - ClampMod:$clamp, omod:$omod) - /* else */, - // VOP3 without modifiers - (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) - /* endif */ ))); -} - -// Returns the assembly string for the inputs and outputs of a VOP[12C] -// instruction. This does not add the _e32 suffix, so it can be reused -// by getAsm64. -class getAsm32 { - string src1 = ", $src1"; - string src2 = ", $src2"; - string ret = "$dst, $src0"# - !if(!eq(NumSrcArgs, 1), "", src1)# - !if(!eq(NumSrcArgs, 3), src2, ""); -} - -// Returns the assembly string for the inputs and outputs of a VOP3 -// instruction. -class getAsm64 { - string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); - string src1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1_modifiers", - " $src1_modifiers,")); - string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); - string ret = - !if(!eq(HasModifiers, 0), - getAsm32.ret, - "$dst, "#src0#src1#src2#"$clamp"#"$omod"); -} - - -class VOPProfile _ArgVT> { - - field list ArgVT = _ArgVT; - - field ValueType DstVT = ArgVT[0]; - field ValueType Src0VT = ArgVT[1]; - field ValueType Src1VT = ArgVT[2]; - field ValueType Src2VT = ArgVT[3]; - field RegisterOperand DstRC = getVALUDstForVT.ret; - field RegisterOperand Src0RC32 = getVOPSrc0ForVT.ret; - field RegisterClass Src1RC32 = getVOPSrc1ForVT.ret; - field RegisterOperand Src0RC64 = getVOP3SrcForVT.ret; - field RegisterOperand Src1RC64 = getVOP3SrcForVT.ret; - field RegisterOperand Src2RC64 = getVOP3SrcForVT.ret; - - field int NumSrcArgs = getNumSrcArgs.ret; - field bit HasModifiers = hasModifiers.ret; - - field dag Outs = (outs DstRC:$dst); - - field dag Ins32 = getIns32.ret; - field dag Ins64 = getIns64.ret; - - field string Asm32 = getAsm32.ret; - field string Asm64 = getAsm64.ret; -} - -// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order -// for the instruction patterns to work. 
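getAsm32/getAsm64 above build the operand portion of the mnemonic purely from NumSrcArgs and HasModifiers. The same concatenation in C++ (keeping the TableGen version's operand names):

#include <string>
#include <cstdio>

std::string asm32(int NumSrcArgs) {
  std::string S = "$dst, $src0";
  if (NumSrcArgs >= 2) S += ", $src1";
  if (NumSrcArgs == 3) S += ", $src2";
  return S;
}

std::string asm64(int NumSrcArgs, bool HasModifiers) {
  if (!HasModifiers)
    return asm32(NumSrcArgs); // without modifiers, reuse the 32-bit form
  std::string S = "$dst, $src0_modifiers";
  if (NumSrcArgs >= 2) S += ", $src1_modifiers";
  if (NumSrcArgs == 3) S += ", $src2_modifiers";
  return S + "$clamp$omod";
}

int main() {
  std::printf("%s\n%s\n", asm32(2).c_str(), asm64(3, true).c_str());
}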
-def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; -def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; -def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; - -def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; -def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; -def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; - -def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; -def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; -def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; -def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>; -def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>; -def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; -def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; -def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; -def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; - -def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; -def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; -def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; -def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; -def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; -def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; -def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; -def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { - let Src0RC32 = VCSrc_32; -} - -def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; -} - -def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; -} - -def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; -def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; -def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; -def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); - let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); - let Asm64 = "$dst, $src0, $src1, $src2"; -} - -def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; -def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { - field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); - field string Asm = "$dst, $src0, $vsrc1, $src2"; -} -def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; -def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; -def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; - - -class VOP { - string OpName = opName; -} - -class VOP2_REV { - string RevOp = revOp; - bit IsOrig = isOrig; -} - -class AtomicNoRet { - string NoRetOp = noRetOp; - bit IsRet = isRet; -} - -class VOP1_Pseudo pattern, string opName> : - VOP1Common , - VOP , - SIMCInstr , - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; - - field bits<8> vdst; - field bits<9> src0; -} - -class VOP1_Real_si : - VOP1, - SIMCInstr { - let AssemblerPredicate = SIAssemblerPredicate; -} - -class VOP1_Real_vi : - VOP1, - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass VOP1_m pattern, - string opName> { - def "" : VOP1_Pseudo ; - - def _si : VOP1_Real_si ; - - def _vi : VOP1_Real_vi ; -} - -multiclass VOP1SI_m pattern, - string opName> { - def "" : VOP1_Pseudo ; - - def _si : VOP1_Real_si ; -} - -class VOP2_Pseudo 
pattern, string opName> : - VOP2Common , - VOP , - SIMCInstr, - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class VOP2_Real_si : - VOP2 , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class VOP2_Real_vi : - VOP2 , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass VOP2SI_m pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo , - VOP2_REV; - - def _si : VOP2_Real_si ; -} - -multiclass VOP2_m pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo , - VOP2_REV; - - def _si : VOP2_Real_si ; - - def _vi : VOP2_Real_vi ; - -} - -class VOP3DisableFields { - - bits<2> src0_modifiers = !if(HasModifiers, ?, 0); - bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); - bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0); - bits<2> omod = !if(HasModifiers, ?, 0); - bits<1> clamp = !if(HasModifiers, ?, 0); - bits<9> src1 = !if(HasSrc1, ?, 0); - bits<9> src2 = !if(HasSrc2, ?, 0); -} - -class VOP3DisableModFields { - bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0); - bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0); - bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0); - bits<2> omod = !if(HasOutputMods, ?, 0); - bits<1> clamp = !if(HasOutputMods, ?, 0); -} - -class VOP3_Pseudo pattern, string opName> : - VOP3Common , - VOP , - SIMCInstr, - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class VOP3_Real_si op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3e , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class VOP3_Real_vi op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3e_vi , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -class VOP3b_Real_si op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3be , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class VOP3b_Real_vi op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3be_vi , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass VOP3_m pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - def _si : VOP3_Real_si , - VOP3DisableFields; - def _vi : VOP3_Real_vi , - VOP3DisableFields; -} - -// VOP3_m without source modifiers -multiclass VOP3_m_nomods pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - let src0_modifiers = 0, - src1_modifiers = 0, - src2_modifiers = 0, - clamp = 0, - omod = 0 in { - def _si : VOP3_Real_si ; - def _vi : VOP3_Real_vi ; - } -} - -multiclass VOP3_1_m pattern, string opName, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - def _si : VOP3_Real_si , - VOP3DisableFields<0, 0, HasMods>; - - def _vi : VOP3_Real_vi , - VOP3DisableFields<0, 0, HasMods>; -} - -multiclass VOP3SI_1_m pattern, string opName, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - def _si : VOP3_Real_si , - VOP3DisableFields<0, 0, HasMods>; - // No VI instruction. This class is for SI only. -} - -multiclass VOP3_2_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - - def "" : VOP3_Pseudo , - VOP2_REV; - - def _si : VOP3_Real_si , - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3_Real_vi , - VOP3DisableFields<1, 0, HasMods>; -} - -multiclass VOP3SI_2_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - - def "" : VOP3_Pseudo , - VOP2_REV; - - def _si : VOP3_Real_si , - VOP3DisableFields<1, 0, HasMods>; - - // No VI instruction. This class is for SI only. 
-} - -// XXX - Is v_div_scale_{f32|f64} only available in vop3b without -// option of implicit vcc use? -multiclass VOP3b_2_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - def "" : VOP3_Pseudo , - VOP2_REV; - - // The VOP2 variant puts the carry out into VCC, the VOP3 variant - // can write it into any SGPR. We currently don't use the carry out, - // so for now hardcode it to VCC as well. - let sdst = SIOperand.VCC, Defs = [VCC] in { - def _si : VOP3b_Real_si , - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3b_Real_vi , - VOP3DisableFields<1, 0, HasMods>; - } // End sdst = SIOperand.VCC, Defs = [VCC] -} - -multiclass VOP3b_3_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - def "" : VOP3_Pseudo ; - - - def _si : VOP3b_Real_si , - VOP3DisableFields<1, 1, HasMods>; - - def _vi : VOP3b_Real_vi , - VOP3DisableFields<1, 1, HasMods>; -} - -multiclass VOP3_C_m pattern, string opName, - bit HasMods, bit defExec, string revOp> { - - def "" : VOP3_Pseudo , - VOP2_REV; - - def _si : VOP3_Real_si , - VOP3DisableFields<1, 0, HasMods> { - let Defs = !if(defExec, [EXEC], []); - } - - def _vi : VOP3_Real_vi , - VOP3DisableFields<1, 0, HasMods> { - let Defs = !if(defExec, [EXEC], []); - } -} - -// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers. -multiclass VOP2SI_3VI_m pattern = []> { - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : VOPAnyCommon , - SIMCInstr; - } - - def _si : VOP2 , - SIMCInstr { - let AssemblerPredicates = [isSICI]; - } - - def _vi : VOP3Common , - VOP3e_vi , - VOP3DisableFields <1, 0, 0>, - SIMCInstr { - let AssemblerPredicates = [isVI]; - } -} - -multiclass VOP1_Helper pat32, - dag ins64, string asm64, list pat64, - bit HasMods> { - - defm _e32 : VOP1_m ; - - defm _e64 : VOP3_1_m ; -} - -multiclass VOP1Inst : VOP1_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, - i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), - P.HasModifiers ->; - -multiclass VOP1InstSI { - - defm _e32 : VOP1SI_m ; - - defm _e64 : VOP3SI_1_m ; -} - -multiclass VOP2_Helper pat32, - dag ins64, string asm64, list pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2_m ; - - defm _e64 : VOP3_2_m ; -} - -multiclass VOP2Inst : VOP2_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers ->; - -multiclass VOP2InstSI { - defm _e32 : VOP2SI_m ; - - defm _e64 : VOP3SI_2_m ; -} - -multiclass VOP2b_Helper pat32, - dag ins64, string asm64, list pat64, - string revOp, bit HasMods> { - - defm _e32 : VOP2_m ; - - defm _e64 : VOP3b_2_m ; -} - -multiclass VOP2bInst : VOP2b_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers ->; - -// A VOP2 instruction that is VOP3-only on VI. 
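As the comment above notes, the VOP2 form of a carry-writing instruction fixes its second result to VCC while VOP3b can target any SGPR; either way the data flow is a 32-bit add plus a 1-bit carry. A checkable restatement:

#include <cstdint>
#include <cstdio>

struct AddResult { uint32_t Vdst; bool Carry; }; // Carry lands in VCC/SGPR

// 32-bit add whose carry-out is the second, 1-bit result: the carry is
// set exactly when the unsigned sum overflows 32 bits.
AddResult addWithCarry(uint32_t A, uint32_t B) {
  uint64_t Wide = uint64_t(A) + uint64_t(B);
  return {uint32_t(Wide), Wide > 0xffffffffull};
}

int main() {
  AddResult R = addWithCarry(0xffffffffu, 1);
  std::printf("vdst=%u carry=%d\n", R.Vdst, R.Carry);
}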
-multiclass VOP2_VI3_Helper pat32, - dag ins64, string asm64, list pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2SI_m ; - - defm _e64 : VOP3_2_m ; -} - -multiclass VOP2_VI3_Inst - : VOP2_VI3_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers ->; - -multiclass VOP2MADK pattern = []> { - - def "" : VOP2_Pseudo ; - -let isCodeGenOnly = 0 in { - def _si : VOP2Common , - SIMCInstr , - VOP2_MADKe { - let AssemblerPredicates = [isSICI]; - } - - def _vi : VOP2Common , - SIMCInstr , - VOP2_MADKe { - let AssemblerPredicates = [isVI]; - } -} // End isCodeGenOnly = 0 -} - -class VOPC_Pseudo pattern, string opName> : - VOPCCommon , - VOP , - SIMCInstr, - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -multiclass VOPC_m pattern, - string opName, bit DefExec, string revOpName = ""> { - def "" : VOPC_Pseudo ; - - def _si : VOPC, - SIMCInstr { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - } - - def _vi : VOPC, - SIMCInstr { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - } -} - -multiclass VOPC_Helper pat32, - dag out64, dag ins64, string asm64, list pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m ; - - defm _e64 : VOP3_C_m ; -} - -// Special case for class instructions which only have modifiers on -// the 1st source operand. -multiclass VOPC_Class_Helper pat32, - dag out64, dag ins64, string asm64, list pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m ; - - defm _e64 : VOP3_C_m , - VOP3DisableModFields<1, 0, 0>; -} - -multiclass VOPCInst : VOPC_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set i1:$dst, - (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - cond))], - [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), - P.HasModifiers, DefExec, revOp ->; - -multiclass VOPCClassInst : VOPC_Class_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set i1:$dst, - (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], - [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), - P.HasModifiers, DefExec, opName ->; - - -multiclass VOPC_F32 : - VOPCInst ; - -multiclass VOPC_F64 : - VOPCInst ; - -multiclass VOPC_I32 : - VOPCInst ; - -multiclass VOPC_I64 : - VOPCInst ; - - -multiclass VOPCX - : VOPCInst ; - -multiclass VOPCX_F32 : - VOPCX ; - -multiclass VOPCX_F64 : - VOPCX ; - -multiclass VOPCX_I32 : - VOPCX ; - -multiclass VOPCX_I64 : - VOPCX ; - -multiclass VOP3_Helper pat, int NumSrcArgs, bit HasMods> : VOP3_m < - op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods ->; - -multiclass VOPC_CLASS_F32 : - VOPCClassInst ; - -multiclass VOPCX_CLASS_F32 : - VOPCClassInst ; - -multiclass VOPC_CLASS_F64 : - VOPCClassInst ; - -multiclass VOPCX_CLASS_F64 : - VOPCClassInst ; - -multiclass VOP3Inst : VOP3_Helper < - op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64, - !if(!eq(P.NumSrcArgs, 3), - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 
P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, - P.Src2VT:$src2))]), - !if(!eq(P.NumSrcArgs, 2), - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) - /* P.NumSrcArgs == 1 */, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), - P.NumSrcArgs, P.HasModifiers ->; - -// Special case for v_div_fmas_{f32|f64}, since it seems to be the -// only VOP instruction that implicitly reads VCC. -multiclass VOP3_VCC_Inst : VOP3_Helper < - op, opName, - (outs P.DstRC.RegClass:$dst), - (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, - InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, - InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, - ClampMod:$clamp, - omod:$omod), - " $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), - (i1 VCC)))], - 3, 1 ->; - -multiclass VOP3b_Helper pattern> : - VOP3b_3_m < - op, (outs vrc:$vdst, SReg_64:$sdst), - (ins InputModsNoDefault:$src0_modifiers, arc:$src0, - InputModsNoDefault:$src1_modifiers, arc:$src1, - InputModsNoDefault:$src2_modifiers, arc:$src2, - ClampMod:$clamp, omod:$omod), - opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, - opName, opName, 1, 1 ->; - -multiclass VOP3b_64 pattern> : - VOP3b_Helper ; - -multiclass VOP3b_32 pattern> : - VOP3b_Helper ; - - -class Vop3ModPat : Pat< - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))), - (Inst i32:$src0_modifiers, P.Src0VT:$src0, - i32:$src1_modifiers, P.Src1VT:$src1, - i32:$src2_modifiers, P.Src2VT:$src2, - i1:$clamp, - i32:$omod)>; - -//===----------------------------------------------------------------------===// -// Interpolation opcodes -//===----------------------------------------------------------------------===// - -class VINTRP_Pseudo pattern> : - VINTRPCommon , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class VINTRP_Real_si op, string opName, dag outs, dag ins, - string asm> : - VINTRPCommon , - VINTRPe , - SIMCInstr; - -class VINTRP_Real_vi op, string opName, dag outs, dag ins, - string asm> : - VINTRPCommon , - VINTRPe_vi , - SIMCInstr; - -multiclass VINTRP_m op, dag outs, dag ins, string asm, - list pattern = []> { - def "" : VINTRP_Pseudo ; - - def _si : VINTRP_Real_si ; - - def _vi : VINTRP_Real_vi ; -} - -//===----------------------------------------------------------------------===// -// Vector I/O classes -//===----------------------------------------------------------------------===// - -class DS_Pseudo pattern> : - DS , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class DS_Real_si op, string opName, dag outs, dag ins, string asm> : - 
DS , - DSe , - SIMCInstr { - let isCodeGenOnly = 0; -} - -class DS_Real_vi op, string opName, dag outs, dag ins, string asm> : - DS , - DSe_vi , - SIMCInstr ; - -class DS_Off16_Real_si op, string opName, dag outs, dag ins, string asm> : - DS_Real_si { - - // Single load interpret the 2 i8imm operands as a single i16 offset. - bits<16> offset; - let offset0 = offset{7-0}; - let offset1 = offset{15-8}; - let isCodeGenOnly = 0; -} - -class DS_Off16_Real_vi op, string opName, dag outs, dag ins, string asm> : - DS_Real_vi { - - // Single load interpret the 2 i8imm operands as a single i16 offset. - bits<16> offset; - let offset0 = offset{7-0}; - let offset1 = offset{15-8}; -} - -multiclass DS_1A_RET op, string opName, RegisterClass rc, - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), - string asm = opName#" $vdst, $addr"#"$offset$gds"> { - - def "" : DS_Pseudo ; - - let data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_1A_Off8_RET op, string opName, RegisterClass rc, - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, - gds01:$gds), - string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> { - - def "" : DS_Pseudo ; - - let data0 = 0, data1 = 0, AsmMatchConverter = "cvtDSOffset01" in { - def _si : DS_Real_si ; - def _vi : DS_Real_vi ; - } -} - -multiclass DS_1A1D_NORET op, string opName, RegisterClass rc, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), - string asm = opName#" $addr, $data0"#"$offset$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - let data1 = 0, vdst = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_1A1D_Off8_NORET op, string opName, RegisterClass rc, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds), - string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> { - - def "" : DS_Pseudo ; - - let vdst = 0, AsmMatchConverter = "cvtDSOffset01" in { - def _si : DS_Real_si ; - def _vi : DS_Real_vi ; - } -} - -multiclass DS_1A1D_RET op, string opName, RegisterClass rc, - string noRetOp = "", - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), - string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - let data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_1A2D_RET_m op, string opName, RegisterClass rc, - string noRetOp = "", dag ins, - dag outs = (outs rc:$vdst), - string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; -} - -multiclass DS_1A2D_RET op, string asm, RegisterClass rc, - string noRetOp = "", RegisterClass src = rc> : - DS_1A2D_RET_m ; - -multiclass DS_1A2D_NORET op, string opName, RegisterClass rc, - string noRetOp = opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - ds_offset:$offset, gds:$gds), - string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - let vdst = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_0A_RET op, string opName, - dag outs = (outs VGPR_32:$vdst), - dag ins = (ins ds_offset:$offset, gds:$gds), - string asm 
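DS_Off16_Real_si/_vi above reinterpret the two 8-bit offset operands as one 16-bit offset (offset0 = offset{7-0}, offset1 = offset{15-8}). The same packing in C++:

#include <cstdint>
#include <cstdio>

struct DSOffsets { uint8_t Offset0, Offset1; };

// Split a single 16-bit DS offset into the two encoded byte fields.
DSOffsets splitOffset(uint16_t Offset) {
  return {uint8_t(Offset & 0xff), uint8_t(Offset >> 8)};
}

// Reassemble the 16-bit offset from the two byte fields.
uint16_t joinOffset(DSOffsets O) {
  return uint16_t(O.Offset0) | uint16_t(O.Offset1) << 8;
}

int main() {
  DSOffsets O = splitOffset(0x1234);
  std::printf("%02x %02x %04x\n", O.Offset0, O.Offset1, joinOffset(O));
}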
= opName#" $vdst"#"$offset"#"$gds"> { - - let mayLoad = 1, mayStore = 1 in { - def "" : DS_Pseudo ; - - let addr = 0, data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } // end addr = 0, data0 = 0, data1 = 0 - } // end mayLoad = 1, mayStore = 1 -} - -multiclass DS_1A_RET_GDS op, string opName, - dag outs = (outs VGPR_32:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset), - string asm = opName#" $vdst, $addr"#"$offset gds"> { - - def "" : DS_Pseudo ; - - let data0 = 0, data1 = 0, gds = 1 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } // end data0 = 0, data1 = 0, gds = 1 -} - -multiclass DS_1A_GDS op, string opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr), - string asm = opName#" $addr gds"> { - - def "" : DS_Pseudo ; - - let vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1 in { - def _si : DS_Real_si ; - def _vi : DS_Real_vi ; - } // end vdst = 0, data = 0, data1 = 0, gds = 1 -} - -multiclass DS_1A op, string opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), - string asm = opName#" $addr"#"$offset"#"$gds"> { - - let mayLoad = 1, mayStore = 1 in { - def "" : DS_Pseudo ; - - let vdst = 0, data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } // let vdst = 0, data0 = 0, data1 = 0 - } // end mayLoad = 1, mayStore = 1 -} - -//===----------------------------------------------------------------------===// -// MTBUF classes -//===----------------------------------------------------------------------===// - -class MTBUF_Pseudo pattern> : - MTBUF , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class MTBUF_Real_si op, string opName, dag outs, dag ins, - string asm> : - MTBUF , - MTBUFe , - SIMCInstr; - -class MTBUF_Real_vi op, string opName, dag outs, dag ins, string asm> : - MTBUF , - MTBUFe_vi , - SIMCInstr ; - -multiclass MTBUF_m op, string opName, dag outs, dag ins, string asm, - list pattern> { - - def "" : MTBUF_Pseudo ; - - def _si : MTBUF_Real_si ; - - def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>; - -} - -let mayStore = 1, mayLoad = 0 in { - -multiclass MTBUF_Store_Helper op, string opName, - RegisterClass regClass> : MTBUF_m < - op, opName, (outs), - (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), - opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] ->; - -} // mayStore = 1, mayLoad = 0 - -let mayLoad = 1, mayStore = 0 in { - -multiclass MTBUF_Load_Helper op, string opName, - RegisterClass regClass> : MTBUF_m < - op, opName, (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), - opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] ->; - -} // mayLoad = 1, mayStore = 0 - -//===----------------------------------------------------------------------===// -// MUBUF classes -//===----------------------------------------------------------------------===// - -class mubuf si, bits<7> vi = si> { - field bits<7> SI = si; - field bits<7> VI = vi; -} - -let isCodeGenOnly = 0 in { - -class MUBUF_si op, dag outs, dag ins, string asm, list pattern> 
: - MUBUF , MUBUFe { - let lds = 0; -} - -} // End let isCodeGenOnly = 0 - -class MUBUF_vi op, dag outs, dag ins, string asm, list pattern> : - MUBUF , MUBUFe_vi { - let lds = 0; -} - -class MUBUFAddr64Table { - bit IsAddr64 = is_addr64; - string OpName = NAME # suffix; -} - -class MUBUF_Pseudo pattern> : - MUBUF , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; - - // dummy fields, so that we can use let statements around multiclasses - bits<1> offen; - bits<1> idxen; - bits<8> vaddr; - bits<1> glc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; -} - -class MUBUF_Real_si : - MUBUF , - MUBUFe , - SIMCInstr { - let lds = 0; -} - -class MUBUF_Real_vi : - MUBUF , - MUBUFe_vi , - SIMCInstr { - let lds = 0; -} - -multiclass MUBUF_m pattern> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <0>; - - let addr64 = 0, isCodeGenOnly = 0 in { - def _si : MUBUF_Real_si ; - } - - def _vi : MUBUF_Real_vi ; -} - -multiclass MUBUFAddr64_m pattern> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <1>; - - let addr64 = 1, isCodeGenOnly = 0 in { - def _si : MUBUF_Real_si ; - } - - // There is no VI version. If the pseudo is selected, it should be lowered - // for VI appropriately. -} - -multiclass MUBUFAtomicOffset_m pattern, bit is_return> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>, - AtomicNoRet; - - let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in { - let addr64 = 0 in { - def _si : MUBUF_Real_si ; - } - - def _vi : MUBUF_Real_vi ; - } -} - -multiclass MUBUFAtomicAddr64_m pattern, bit is_return> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, - AtomicNoRet; - - let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in { - def _si : MUBUF_Real_si ; - } - - // There is no VI version. If the pseudo is selected, it should be lowered - // for VI appropriately. 
-} - -multiclass MUBUF_Atomic { - - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { - - // No return variants - let glc = 0 in { - - defm _ADDR64 : MUBUFAtomicAddr64_m < - op, name#"_addr64", (outs), - (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, - SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 - >; - - defm _OFFSET : MUBUFAtomicOffset_m < - op, name#"_offset", (outs), - (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, - slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 - >; - } // glc = 0 - - // Variant that return values - let glc = 1, Constraints = "$vdata = $vdata_in", - DisableEncoding = "$vdata_in" in { - - defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < - op, name#"_rtn_addr64", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, - SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", - [(set vt:$vdata, - (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$slc), vt:$vdata_in))], 1 - >; - - defm _RTN_OFFSET : MUBUFAtomicOffset_m < - op, name#"_rtn_offset", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, - mbuf_offset:$offset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", - [(set vt:$vdata, - (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, - i1:$slc), vt:$vdata_in))], 1 - >; - - } // glc = 1 - - } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 -} - -multiclass MUBUF_Load_Helper { - - let mayLoad = 1, mayStore = 0 in { - let offen = 0, idxen = 0, vaddr = 0 in { - defm _OFFSET : MUBUF_m ; - } - - let offen = 1, idxen = 0 in { - defm _OFFEN : MUBUF_m ; - } - - let offen = 0, idxen = 1 in { - defm _IDXEN : MUBUF_m ; - } - - let offen = 1, idxen = 1 in { - defm _BOTHEN : MUBUF_m ; - } - - let offen = 0, idxen = 0 in { - defm _ADDR64 : MUBUFAddr64_m ; - } - } -} - -multiclass MUBUF_Store_Helper { - let mayLoad = 0, mayStore = 1 in { - defm : MUBUF_m ; - - let offen = 0, idxen = 0, vaddr = 0 in { - defm _OFFSET : MUBUF_m ; - } // offen = 0, idxen = 0, vaddr = 0 - - let offen = 1, idxen = 0 in { - defm _OFFEN : MUBUF_m ; - } // end offen = 1, idxen = 0 - - let offen = 0, idxen = 1 in { - defm _IDXEN : MUBUF_m ; - } - - let offen = 1, idxen = 1 in { - defm _BOTHEN : MUBUF_m ; - } - - let offen = 0, idxen = 0 in { - defm _ADDR64 : MUBUFAddr64_m ; - } - } // End mayLoad = 0, mayStore = 1 -} - -class FLAT_Load_Helper op, string asm, RegisterClass regClass> : - FLAT { - let glc = 0; - let slc = 0; - let tfe = 0; - let data = 0; - let mayLoad = 1; -} - -class FLAT_Store_Helper op, string name, RegisterClass vdataClass> : - FLAT { - - let mayLoad = 0; - let mayStore = 1; - - // Encoding - let glc = 0; - let slc = 0; - let tfe = 0; - let vdst = 0; -} - -class MIMG_Mask { - string Op = op; - int Channels = channels; -} - -class MIMG_NoSampler_Helper op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc> : MIMG < - op, - (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc", - []> { - let ssamp = 0; - let mayLoad = 1; - let mayStore = 0; - let hasPostISelHook = 1; -} - -multiclass MIMG_NoSampler_Src_Helper op, string asm, - RegisterClass dst_rc, - int channels> { - def _V1 : 
MIMG_NoSampler_Helper , - MIMG_Mask; - def _V2 : MIMG_NoSampler_Helper , - MIMG_Mask; - def _V4 : MIMG_NoSampler_Helper , - MIMG_Mask; -} - -multiclass MIMG_NoSampler op, string asm> { - defm _V1 : MIMG_NoSampler_Src_Helper ; - defm _V2 : MIMG_NoSampler_Src_Helper ; - defm _V3 : MIMG_NoSampler_Src_Helper ; - defm _V4 : MIMG_NoSampler_Src_Helper ; -} - -class MIMG_Sampler_Helper op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, int wqm> : MIMG < - op, - (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc, SReg_128:$ssamp), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", - []> { - let mayLoad = 1; - let mayStore = 0; - let hasPostISelHook = 1; - let WQM = wqm; -} - -multiclass MIMG_Sampler_Src_Helper op, string asm, - RegisterClass dst_rc, - int channels, int wqm> { - def _V1 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V2 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V4 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V8 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V16 : MIMG_Sampler_Helper , - MIMG_Mask; -} - -multiclass MIMG_Sampler op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper; - defm _V2 : MIMG_Sampler_Src_Helper; - defm _V3 : MIMG_Sampler_Src_Helper; - defm _V4 : MIMG_Sampler_Src_Helper; -} - -multiclass MIMG_Sampler_WQM op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper; - defm _V2 : MIMG_Sampler_Src_Helper; - defm _V3 : MIMG_Sampler_Src_Helper; - defm _V4 : MIMG_Sampler_Src_Helper; -} - -class MIMG_Gather_Helper op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, int wqm> : MIMG < - op, - (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc, SReg_128:$ssamp), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", - []> { - let mayLoad = 1; - let mayStore = 0; - - // DMASK was repurposed for GATHER4. 4 components are always - // returned and DMASK works like a swizzle - it selects - // the component to fetch. The only useful DMASK values are - // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns - // (red,red,red,red) etc.) The ISA document doesn't mention - // this. 
-  // Therefore, disable all code which updates DMASK by setting these two:
-  let MIMG = 0;
-  let hasPostISelHook = 0;
-  let WQM = wqm;
-}
-
-multiclass MIMG_Gather_Src_Helper op, string asm,
-                                  RegisterClass dst_rc,
-                                  int channels, int wqm> {
-  def _V1 : MIMG_Gather_Helper ,
-            MIMG_Mask;
-  def _V2 : MIMG_Gather_Helper ,
-            MIMG_Mask;
-  def _V4 : MIMG_Gather_Helper ,
-            MIMG_Mask;
-  def _V8 : MIMG_Gather_Helper ,
-            MIMG_Mask;
-  def _V16 : MIMG_Gather_Helper ,
-             MIMG_Mask;
-}
-
-multiclass MIMG_Gather op, string asm> {
-  defm _V1 : MIMG_Gather_Src_Helper;
-  defm _V2 : MIMG_Gather_Src_Helper;
-  defm _V3 : MIMG_Gather_Src_Helper;
-  defm _V4 : MIMG_Gather_Src_Helper;
-}
-
-multiclass MIMG_Gather_WQM op, string asm> {
-  defm _V1 : MIMG_Gather_Src_Helper;
-  defm _V2 : MIMG_Gather_Src_Helper;
-  defm _V3 : MIMG_Gather_Src_Helper;
-  defm _V4 : MIMG_Gather_Src_Helper;
-}
-
-//===----------------------------------------------------------------------===//
-// Vector instruction mappings
-//===----------------------------------------------------------------------===//
-
-// Maps an opcode in e32 form to its e64 equivalent
-def getVOPe64 : InstrMapping {
-  let FilterClass = "VOP";
-  let RowFields = ["OpName"];
-  let ColFields = ["Size"];
-  let KeyCol = ["4"];
-  let ValueCols = [["8"]];
-}
-
-// Maps an opcode in e64 form to its e32 equivalent
-def getVOPe32 : InstrMapping {
-  let FilterClass = "VOP";
-  let RowFields = ["OpName"];
-  let ColFields = ["Size"];
-  let KeyCol = ["8"];
-  let ValueCols = [["4"]];
-}
-
-// Maps a MIMG opcode with all four channels enabled to its 1-, 2- or
-// 3-channel variant.
-def getMaskedMIMGOp : InstrMapping {
-  let FilterClass = "MIMG_Mask";
-  let RowFields = ["Op"];
-  let ColFields = ["Channels"];
-  let KeyCol = ["4"];
-  let ValueCols = [["1"], ["2"], ["3"]];
-}
-
-// Maps a commuted opcode to its original version
-def getCommuteOrig : InstrMapping {
-  let FilterClass = "VOP2_REV";
-  let RowFields = ["RevOp"];
-  let ColFields = ["IsOrig"];
-  let KeyCol = ["0"];
-  let ValueCols = [["1"]];
-}
-
-// Maps an original opcode to its commuted version
-def getCommuteRev : InstrMapping {
-  let FilterClass = "VOP2_REV";
-  let RowFields = ["RevOp"];
-  let ColFields = ["IsOrig"];
-  let KeyCol = ["1"];
-  let ValueCols = [["0"]];
-}
-
-// Maps a commuted compare opcode to its original version
-def getCommuteCmpOrig : InstrMapping {
-  let FilterClass = "VOP2_REV";
-  let RowFields = ["RevOp"];
-  let ColFields = ["IsOrig"];
-  let KeyCol = ["0"];
-  let ValueCols = [["1"]];
-}
-
-// Maps an original compare opcode to its commuted version
-def getCommuteCmpRev : InstrMapping {
-  let FilterClass = "VOP2_REV";
-  let RowFields = ["RevOp"];
-  let ColFields = ["IsOrig"];
-  let KeyCol = ["1"];
-  let ValueCols = [["0"]];
-}
-
-def getMCOpcodeGen : InstrMapping {
-  let FilterClass = "SIMCInstr";
-  let RowFields = ["PseudoInstr"];
-  let ColFields = ["Subtarget"];
-  let KeyCol = [!cast<string>(SISubtarget.NONE)];
-  let ValueCols = [[!cast<string>(SISubtarget.SI)],[!cast<string>(SISubtarget.VI)]];
-}
-
-def getAddr64Inst : InstrMapping {
-  let FilterClass = "MUBUFAddr64Table";
-  let RowFields = ["OpName"];
-  let ColFields = ["IsAddr64"];
-  let KeyCol = ["0"];
-  let ValueCols = [["1"]];
-}
-
-// Maps an atomic opcode to its version with a return value.
-def getAtomicRetOp : InstrMapping {
-  let FilterClass = "AtomicNoRet";
-  let RowFields = ["NoRetOp"];
-  let ColFields = ["IsRet"];
-  let KeyCol = ["0"];
-  let ValueCols = [["1"]];
-}
-
-// Maps an atomic opcode to its returnless version.
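-// These InstrMapping defs become lookup tables in the generated
-// AMDGPUGenInstrInfo.inc. As a sketch (accessor names assumed from the
-// usual TableGen conventions), a pass would query them like:
-//   int NoRetOpc = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
-//   if (NoRetOpc != -1) { /* rewrite MI to the form without a result */ }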
-def getAtomicNoRetOp : InstrMapping { - let FilterClass = "AtomicNoRet"; - let RowFields = ["NoRetOp"]; - let ColFields = ["IsRet"]; - let KeyCol = ["1"]; - let ValueCols = [["0"]]; -} - -include "SIInstructions.td" -include "CIInstructions.td" -include "VIInstructions.td" Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIRegisterInfo.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIRegisterInfo.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIRegisterInfo.h (nonexistent) @@ -1,131 +0,0 @@ -//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition for SIRegisterInfo -// -//===----------------------------------------------------------------------===// - - -#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H -#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H - -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/Support/Debug.h" - -namespace llvm { - -struct SIRegisterInfo : public AMDGPURegisterInfo { - - SIRegisterInfo(); - - BitVector getReservedRegs(const MachineFunction &MF) const override; - - unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const override; - - bool requiresRegisterScavenging(const MachineFunction &Fn) const override; - - void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const override; - - /// \brief get the register class of the specified type to use in the - /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - - unsigned getHWRegIndex(unsigned Reg) const override; - - /// \brief Return the 'base' register class for this register. - /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. - const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; - - /// \returns true if this class contains only SGPR registers - bool isSGPRClass(const TargetRegisterClass *RC) const { - if (!RC) - return false; - - return !hasVGPRs(RC); - } - - /// \returns true if this class ID contains only SGPR registers - bool isSGPRClassID(unsigned RCID) const { - if (static_cast(RCID) == -1) - return false; - - return isSGPRClass(getRegClass(RCID)); - } - - /// \returns true if this class contains VGPR registers. - bool hasVGPRs(const TargetRegisterClass *RC) const; - - /// \returns A VGPR reg class with the same width as \p SRC - const TargetRegisterClass *getEquivalentVGPRClass( - const TargetRegisterClass *SRC) const; - - /// \returns The register class that is used for a sub-register of \p RC for - /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will - /// be returned. - const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, - unsigned SubIdx) const; - - /// \p Channel This is the register channel (e.g. a value from 0-16), not the - /// SubReg index. - /// \returns The sub-register of Reg that is in Channel. - unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, - unsigned Channel) const; - - /// \returns True if operands defined with this operand type can accept - /// a literal constant (i.e. any 32-bit immediate). 
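-  /// For example, an operand that accepts literals can carry an arbitrary
-  /// bit pattern such as 0x3e22f983 in an extra dword of the encoding;
-  /// at most one literal may appear per instruction.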
- bool opCanUseLiteralConstant(unsigned OpType) const; - - /// \returns True if operands defined with this operand type can accept - /// an inline constant. i.e. An integer value in the range (-16, 64) or - /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. - bool opCanUseInlineConstant(unsigned OpType) const; - - enum PreloadedValue { - TGID_X, - TGID_Y, - TGID_Z, - SCRATCH_WAVE_OFFSET, - SCRATCH_PTR, - INPUT_PTR, - TIDIG_X, - TIDIG_Y, - TIDIG_Z - }; - - /// \brief Returns the physical register that \p Value is stored in. - unsigned getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const; - - /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount - /// concurrent waves. - unsigned getNumVGPRsAllowed(unsigned WaveCount) const; - - /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount - /// concurrent waves. - unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, - unsigned WaveCount) const; - - unsigned findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const; - -private: - void buildScratchLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, unsigned Value, - unsigned ScratchRsrcReg, unsigned ScratchOffset, - int64_t Offset, RegScavenger *RS) const; -}; - -} // End namespace llvm - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp (nonexistent) @@ -1,1315 +0,0 @@ -//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/MC/MCParser/MCAsmParser.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -namespace { - -struct OptionalOperand; - -class AMDGPUOperand : public MCParsedAsmOperand { - enum KindTy { - Token, - Immediate, - Register, - Expression - } Kind; - - SMLoc StartLoc, EndLoc; - -public: - AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} - - MCContext *Ctx; - - enum ImmTy { - ImmTyNone, - ImmTyDSOffset0, - ImmTyDSOffset1, - ImmTyGDS, - ImmTyOffset, - ImmTyGLC, - ImmTySLC, - ImmTyTFE, - ImmTyClamp, - ImmTyOMod - }; - - struct TokOp { - const char *Data; - unsigned Length; - }; - - struct ImmOp { - bool IsFPImm; - ImmTy Type; - int64_t Val; - }; - - struct RegOp { - unsigned RegNo; - int Modifiers; - const MCRegisterInfo *TRI; - bool IsForcedVOP3; - }; - - union { - TokOp Tok; - ImmOp Imm; - RegOp Reg; - const MCExpr *Expr; - }; - - void addImmOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createImm(getImm())); - } - - StringRef getToken() const { - return StringRef(Tok.Data, Tok.Length); - } - - void addRegOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createReg(getReg())); - } - - void addRegOrImmOperands(MCInst &Inst, unsigned N) const { - if (isReg()) - addRegOperands(Inst, N); - else - addImmOperands(Inst, N); - } - - void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createImm( - Reg.Modifiers == -1 ? 
0 : Reg.Modifiers)); - addRegOperands(Inst, N); - } - - void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { - if (isImm()) - addImmOperands(Inst, N); - else { - assert(isExpr()); - Inst.addOperand(MCOperand::createExpr(Expr)); - } - } - - bool defaultTokenHasSuffix() const { - StringRef Token(Tok.Data, Tok.Length); - - return Token.endswith("_e32") || Token.endswith("_e64"); - } - - bool isToken() const override { - return Kind == Token; - } - - bool isImm() const override { - return Kind == Immediate; - } - - bool isInlineImm() const { - float F = BitsToFloat(Imm.Val); - // TODO: Add 0.5pi for VI - return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) || - (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || - F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0)); - } - - bool isDSOffset0() const { - assert(isImm()); - return Imm.Type == ImmTyDSOffset0; - } - - bool isDSOffset1() const { - assert(isImm()); - return Imm.Type == ImmTyDSOffset1; - } - - int64_t getImm() const { - return Imm.Val; - } - - enum ImmTy getImmTy() const { - assert(isImm()); - return Imm.Type; - } - - bool isRegKind() const { - return Kind == Register; - } - - bool isReg() const override { - return Kind == Register && Reg.Modifiers == -1; - } - - bool isRegWithInputMods() const { - return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1); - } - - void setModifiers(unsigned Mods) { - assert(isReg()); - Reg.Modifiers = Mods; - } - - bool hasModifiers() const { - assert(isRegKind()); - return Reg.Modifiers != -1; - } - - unsigned getReg() const override { - return Reg.RegNo; - } - - bool isRegOrImm() const { - return isReg() || isImm(); - } - - bool isRegClass(unsigned RCID) const { - return Reg.TRI->getRegClass(RCID).contains(getReg()); - } - - bool isSCSrc32() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); - } - - bool isSSrc32() const { - return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); - } - - bool isSSrc64() const { - return isImm() || isInlineImm() || - (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); - } - - bool isVCSrc32() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); - } - - bool isVCSrc64() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); - } - - bool isVSrc32() const { - return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); - } - - bool isVSrc64() const { - return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); - } - - bool isMem() const override { - return false; - } - - bool isExpr() const { - return Kind == Expression; - } - - bool isSoppBrTarget() const { - return isExpr() || isImm(); - } - - SMLoc getStartLoc() const override { - return StartLoc; - } - - SMLoc getEndLoc() const override { - return EndLoc; - } - - void print(raw_ostream &OS) const override { } - - static std::unique_ptr CreateImm(int64_t Val, SMLoc Loc, - enum ImmTy Type = ImmTyNone, - bool IsFPImm = false) { - auto Op = llvm::make_unique(Immediate); - Op->Imm.Val = Val; - Op->Imm.IsFPImm = IsFPImm; - Op->Imm.Type = Type; - Op->StartLoc = Loc; - Op->EndLoc = Loc; - return Op; - } - - static std::unique_ptr CreateToken(StringRef Str, SMLoc Loc, - bool HasExplicitEncodingSize = true) { - auto Res = llvm::make_unique(Token); - Res->Tok.Data = Str.data(); - Res->Tok.Length = Str.size(); - Res->StartLoc = Loc; - Res->EndLoc = Loc; - return Res; - } - - static std::unique_ptr CreateReg(unsigned RegNo, SMLoc S, - SMLoc E, - const MCRegisterInfo *TRI, - 
bool ForceVOP3) { - auto Op = llvm::make_unique(Register); - Op->Reg.RegNo = RegNo; - Op->Reg.TRI = TRI; - Op->Reg.Modifiers = -1; - Op->Reg.IsForcedVOP3 = ForceVOP3; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static std::unique_ptr CreateExpr(const class MCExpr *Expr, SMLoc S) { - auto Op = llvm::make_unique(Expression); - Op->Expr = Expr; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - bool isDSOffset() const; - bool isDSOffset01() const; - bool isSWaitCnt() const; - bool isMubufOffset() const; -}; - -class AMDGPUAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; - const MCInstrInfo &MII; - MCAsmParser &Parser; - - unsigned ForcedEncodingSize; - /// @name Auto-generated Match Functions - /// { - -#define GET_ASSEMBLER_HEADER -#include "AMDGPUGenAsmMatcher.inc" - - /// } - -public: - AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0){ - - if (STI.getFeatureBits().none()) { - // Set default features. - STI.ToggleFeature("SOUTHERN_ISLANDS"); - } - - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); - } - - unsigned getForcedEncodingSize() const { - return ForcedEncodingSize; - } - - void setForcedEncodingSize(unsigned Size) { - ForcedEncodingSize = Size; - } - - bool isForcedVOP3() const { - return ForcedEncodingSize == 64; - } - - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - unsigned checkTargetMatchPredicate(MCInst &Inst) override; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, MCStreamer &Out, - uint64_t &ErrorInfo, - bool MatchingInlineAsm) override; - bool ParseDirective(AsmToken DirectiveID) override; - OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, OperandVector &Operands) override; - - OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int, - int64_t Default = 0); - OperandMatchResultTy parseIntWithPrefix(const char *Prefix, - OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = - AMDGPUOperand::ImmTyNone); - OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = - AMDGPUOperand::ImmTyNone); - OperandMatchResultTy parseOptionalOps( - const ArrayRef &OptionalOps, - OperandVector &Operands); - - - void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); - void cvtDS(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands); - OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands); - - bool parseCnt(int64_t &IntVal); - OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); - OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); - - void cvtMubuf(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseOffset(OperandVector &Operands); - OperandMatchResultTy parseMubufOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseGLC(OperandVector &Operands); - OperandMatchResultTy parseSLC(OperandVector &Operands); - OperandMatchResultTy parseTFE(OperandVector &Operands); - - OperandMatchResultTy parseDMask(OperandVector &Operands); - OperandMatchResultTy parseUNorm(OperandVector &Operands); - OperandMatchResultTy 
parseR128(OperandVector &Operands); - - void cvtVOP3(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands); -}; - -struct OptionalOperand { - const char *Name; - AMDGPUOperand::ImmTy Type; - bool IsBit; - int64_t Default; - bool (*ConvertResult)(int64_t&); -}; - -} - -static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { - if (IsVgpr) { - switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); - case 1: return AMDGPU::VGPR_32RegClassID; - case 2: return AMDGPU::VReg_64RegClassID; - case 3: return AMDGPU::VReg_96RegClassID; - case 4: return AMDGPU::VReg_128RegClassID; - case 8: return AMDGPU::VReg_256RegClassID; - case 16: return AMDGPU::VReg_512RegClassID; - } - } - - switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); - case 1: return AMDGPU::SGPR_32RegClassID; - case 2: return AMDGPU::SGPR_64RegClassID; - case 4: return AMDGPU::SReg_128RegClassID; - case 8: return AMDGPU::SReg_256RegClassID; - case 16: return AMDGPU::SReg_512RegClassID; - } -} - -static unsigned getRegForName(const StringRef &RegName) { - - return StringSwitch(RegName) - .Case("exec", AMDGPU::EXEC) - .Case("vcc", AMDGPU::VCC) - .Case("flat_scr", AMDGPU::FLAT_SCR) - .Case("m0", AMDGPU::M0) - .Case("scc", AMDGPU::SCC) - .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) - .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) - .Case("vcc_lo", AMDGPU::VCC_LO) - .Case("vcc_hi", AMDGPU::VCC_HI) - .Case("exec_lo", AMDGPU::EXEC_LO) - .Case("exec_hi", AMDGPU::EXEC_HI) - .Default(0); -} - -bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { - const AsmToken Tok = Parser.getTok(); - StartLoc = Tok.getLoc(); - EndLoc = Tok.getEndLoc(); - const StringRef &RegName = Tok.getString(); - RegNo = getRegForName(RegName); - - if (RegNo) { - Parser.Lex(); - return false; - } - - // Match vgprs and sgprs - if (RegName[0] != 's' && RegName[0] != 'v') - return true; - - bool IsVgpr = RegName[0] == 'v'; - unsigned RegWidth; - unsigned RegIndexInClass; - if (RegName.size() > 1) { - // We have a 32-bit register - RegWidth = 1; - if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) - return true; - Parser.Lex(); - } else { - // We have a register greater than 32-bits. - - int64_t RegLo, RegHi; - Parser.Lex(); - if (getLexer().isNot(AsmToken::LBrac)) - return true; - - Parser.Lex(); - if (getParser().parseAbsoluteExpression(RegLo)) - return true; - - if (getLexer().isNot(AsmToken::Colon)) - return true; - - Parser.Lex(); - if (getParser().parseAbsoluteExpression(RegHi)) - return true; - - if (getLexer().isNot(AsmToken::RBrac)) - return true; - - Parser.Lex(); - RegWidth = (RegHi - RegLo) + 1; - if (IsVgpr) { - // VGPR registers aren't aligned. - RegIndexInClass = RegLo; - } else { - // SGPR registers are aligned. Max alignment is 4 dwords. 
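-      // For example, s[4:7] parses with RegLo = 4 and RegWidth = 4, so it
-      // becomes index 4 / min(4, 4) = 1 in the SReg_128 class, whereas
-      // v[4:7] above keeps index 4 because VGPR tuples are unaligned.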
-      RegIndexInClass = RegLo / std::min(RegWidth, 4u);
-    }
-  }
-
-  const MCRegisterInfo *TRC = getContext().getRegisterInfo();
-  unsigned RC = getRegClass(IsVgpr, RegWidth);
-  if (RegIndexInClass >= TRC->getRegClass(RC).getNumRegs())
-    return true;
-  RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass);
-  return false;
-}
-
-unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
-
-  uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
-
-  if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) ||
-      (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3)))
-    return Match_InvalidOperand;
-
-  return Match_Success;
-}
-
-bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
-                                              OperandVector &Operands,
-                                              MCStreamer &Out,
-                                              uint64_t &ErrorInfo,
-                                              bool MatchingInlineAsm) {
-  MCInst Inst;
-
-  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
-  default: break;
-  case Match_Success:
-    Inst.setLoc(IDLoc);
-    Out.EmitInstruction(Inst, STI);
-    return false;
-  case Match_MissingFeature:
-    return Error(IDLoc, "instruction not supported on this GPU");
-
-  case Match_MnemonicFail:
-    return Error(IDLoc, "unrecognized instruction mnemonic");
-
-  case Match_InvalidOperand: {
-    SMLoc ErrorLoc = IDLoc;
-    if (ErrorInfo != ~0ULL) {
-      if (ErrorInfo >= Operands.size()) {
-        if (isForcedVOP3()) {
-          // If 64-bit encoding has been forced we can end up with no
-          // clamp or omod operands if none of the registers have modifiers,
-          // so we need to add these to the operand list.
-          AMDGPUOperand &LastOp =
-            ((AMDGPUOperand &)*Operands[Operands.size() - 1]);
-          if (LastOp.isRegKind() ||
-              (LastOp.isImm() &&
-               LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) {
-            SMLoc S = Parser.getTok().getLoc();
-            Operands.push_back(AMDGPUOperand::CreateImm(0, S,
-                               AMDGPUOperand::ImmTyClamp));
-            Operands.push_back(AMDGPUOperand::CreateImm(0, S,
-                               AMDGPUOperand::ImmTyOMod));
-            bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands,
-                                               Out, ErrorInfo,
-                                               MatchingInlineAsm);
-            if (!Res)
-              return Res;
-          }
-        }
-        return Error(IDLoc, "too few operands for instruction");
-      }
-
-      ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc();
-      if (ErrorLoc == SMLoc())
-        ErrorLoc = IDLoc;
-    }
-    return Error(ErrorLoc, "invalid operand for instruction");
-  }
-  }
-  llvm_unreachable("Implement any new match types added!");
-}
-
-bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
-  return true;
-}
-
-static bool operandsHaveModifiers(const OperandVector &Operands) {
-
-  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
-    const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]);
-    if (Op.isRegKind() && Op.hasModifiers())
-      return true;
-    if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod ||
-                       Op.getImmTy() == AMDGPUOperand::ImmTyClamp))
-      return true;
-  }
-  return false;
-}
-
-AMDGPUAsmParser::OperandMatchResultTy
-AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
-
-  // Try to parse with a custom parser
-  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
-
-  // If we successfully parsed the operand or if there was an error parsing,
-  // we are done.
-  //
-  // If we are parsing after we reach EndOfStatement then this means we
-  // are appending default values to the Operands list. This is only done
-  // by a custom parser, so we shouldn't continue on to the generic parsing.
-  if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail ||
-      getLexer().is(AsmToken::EndOfStatement))
-    return ResTy;
-
-  bool Negate = false, Abs = false;
-  if (getLexer().getKind() == AsmToken::Minus) {
-    Parser.Lex();
-    Negate = true;
-  }
-
-  if (getLexer().getKind() == AsmToken::Pipe) {
-    Parser.Lex();
-    Abs = true;
-  }
-
-  switch (getLexer().getKind()) {
-  case AsmToken::Integer: {
-    SMLoc S = Parser.getTok().getLoc();
-    int64_t IntVal;
-    if (getParser().parseAbsoluteExpression(IntVal))
-      return MatchOperand_ParseFail;
-    APInt IntVal32(32, IntVal);
-    if (IntVal32.getSExtValue() != IntVal) {
-      Error(S, "invalid immediate: only 32-bit values are legal");
-      return MatchOperand_ParseFail;
-    }
-
-    IntVal = IntVal32.getSExtValue();
-    if (Negate)
-      IntVal *= -1;
-    Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
-    return MatchOperand_Success;
-  }
-  case AsmToken::Real: {
-    // FIXME: We should emit an error if a double precision floating-point
-    // value is used. I'm not sure the best way to detect this.
-    SMLoc S = Parser.getTok().getLoc();
-    int64_t IntVal;
-    if (getParser().parseAbsoluteExpression(IntVal))
-      return MatchOperand_ParseFail;
-
-    APFloat F((float)BitsToDouble(IntVal));
-    if (Negate)
-      F.changeSign();
-    Operands.push_back(
-        AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S));
-    return MatchOperand_Success;
-  }
-  case AsmToken::Identifier: {
-    SMLoc S, E;
-    unsigned RegNo;
-    if (!ParseRegister(RegNo, S, E)) {
-
-      bool HasModifiers = operandsHaveModifiers(Operands);
-      unsigned Modifiers = 0;
-
-      if (Negate)
-        Modifiers |= 0x1;
-
-      if (Abs) {
-        if (getLexer().getKind() != AsmToken::Pipe)
-          return MatchOperand_ParseFail;
-        Parser.Lex();
-        Modifiers |= 0x2;
-      }
-
-      if (Modifiers && !HasModifiers) {
-        // We are adding a modifier to src1 or src2 and previous sources
-        // don't have modifiers, so we need to go back and set empty
-        // modifiers for each previous source.
-        for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1;
-             --PrevRegIdx) {
-
-          AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]);
-          RegOp.setModifiers(0);
-        }
-      }
-
-      Operands.push_back(AMDGPUOperand::CreateReg(
-          RegNo, S, E, getContext().getRegisterInfo(),
-          isForcedVOP3()));
-
-      if (HasModifiers || Modifiers) {
-        AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]);
-        RegOp.setModifiers(Modifiers);
-      }
-    } else {
-      Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(),
-                                                    S));
-      Parser.Lex();
-    }
-    return MatchOperand_Success;
-  }
-  default:
-    return MatchOperand_NoMatch;
-  }
-}
-
-bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
-                                       StringRef Name,
-                                       SMLoc NameLoc, OperandVector &Operands) {
-
-  // Clear any forced encodings from the previous instruction.
-  setForcedEncodingSize(0);
-
-  if (Name.endswith("_e64"))
-    setForcedEncodingSize(64);
-  else if (Name.endswith("_e32"))
-    setForcedEncodingSize(32);
-
-  // Add the instruction mnemonic
-  Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc));
-
-  while (!getLexer().is(AsmToken::EndOfStatement)) {
-    AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name);
-
-    // Eat the comma or space if there is one.
- if (getLexer().is(AsmToken::Comma)) - Parser.Lex(); - - switch (Res) { - case MatchOperand_Success: break; - case MatchOperand_ParseFail: return Error(getLexer().getLoc(), - "failed parsing operand."); - case MatchOperand_NoMatch: return Error(getLexer().getLoc(), - "not a valid operand."); - } - } - - // Once we reach end of statement, continue parsing so we can add default - // values for optional arguments. - AMDGPUAsmParser::OperandMatchResultTy Res; - while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) { - if (Res != MatchOperand_Success) - return Error(getLexer().getLoc(), "failed parsing operand."); - } - return false; -} - -//===----------------------------------------------------------------------===// -// Utility functions -//===----------------------------------------------------------------------===// - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, - int64_t Default) { - - // We are at the end of the statement, and this is a default argument, so - // use a default value. - if (getLexer().is(AsmToken::EndOfStatement)) { - Int = Default; - return MatchOperand_Success; - } - - switch(getLexer().getKind()) { - default: return MatchOperand_NoMatch; - case AsmToken::Identifier: { - StringRef OffsetName = Parser.getTok().getString(); - if (!OffsetName.equals(Prefix)) - return MatchOperand_NoMatch; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::Colon)) - return MatchOperand_ParseFail; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return MatchOperand_ParseFail; - - if (getParser().parseAbsoluteExpression(Int)) - return MatchOperand_ParseFail; - break; - } - } - return MatchOperand_Success; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy) { - - SMLoc S = Parser.getTok().getLoc(); - int64_t Offset = 0; - - AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset); - if (Res != MatchOperand_Success) - return Res; - - Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy)); - return MatchOperand_Success; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy) { - int64_t Bit = 0; - SMLoc S = Parser.getTok().getLoc(); - - // We are at the end of the statement, and this is a default argument, so - // use a default value. 
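-  // (For Name == "glc", say: a bare "glc" token sets the bit, "noglc"
-  // clears it, and an omitted token keeps the default 0.)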
- if (getLexer().isNot(AsmToken::EndOfStatement)) { - switch(getLexer().getKind()) { - case AsmToken::Identifier: { - StringRef Tok = Parser.getTok().getString(); - if (Tok == Name) { - Bit = 1; - Parser.Lex(); - } else if (Tok.startswith("no") && Tok.endswith(Name)) { - Bit = 0; - Parser.Lex(); - } else { - return MatchOperand_NoMatch; - } - break; - } - default: - return MatchOperand_NoMatch; - } - } - - Operands.push_back(AMDGPUOperand::CreateImm(Bit, S, ImmTy)); - return MatchOperand_Success; -} - -static bool operandsHasOptionalOp(const OperandVector &Operands, - const OptionalOperand &OOp) { - for (unsigned i = 0; i < Operands.size(); i++) { - const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]); - if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) || - (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name)) - return true; - - } - return false; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOptionalOps(const ArrayRef &OptionalOps, - OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - for (const OptionalOperand &Op : OptionalOps) { - if (operandsHasOptionalOp(Operands, Op)) - continue; - AMDGPUAsmParser::OperandMatchResultTy Res; - int64_t Value; - if (Op.IsBit) { - Res = parseNamedBit(Op.Name, Operands, Op.Type); - if (Res == MatchOperand_NoMatch) - continue; - return Res; - } - - Res = parseIntWithPrefix(Op.Name, Value, Op.Default); - - if (Res == MatchOperand_NoMatch) - continue; - - if (Res != MatchOperand_Success) - return Res; - - if (Op.ConvertResult && !Op.ConvertResult(Value)) { - return MatchOperand_ParseFail; - } - - Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type)); - return MatchOperand_Success; - } - return MatchOperand_NoMatch; -} - -//===----------------------------------------------------------------------===// -// ds -//===----------------------------------------------------------------------===// - -static const OptionalOperand DSOptionalOps [] = { - {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, - {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} -}; - -static const OptionalOperand DSOptionalOpsOff01 [] = { - {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr}, - {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr}, - {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} -}; - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) { - return parseOptionalOps(DSOptionalOps, Operands); -} -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) { - return parseOptionalOps(DSOptionalOpsOff01, Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - AMDGPUAsmParser::OperandMatchResultTy Res = - parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); - if (Res == MatchOperand_NoMatch) { - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyOffset)); - Res = MatchOperand_Success; - } - return Res; -} - -bool AMDGPUOperand::isDSOffset() const { - return isImm() && isUInt<16>(getImm()); -} - -bool AMDGPUOperand::isDSOffset01() const { - return isImm() && isUInt<8>(getImm()); -} - -void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, - const OperandVector &Operands) { - - std::map OptionalIdx; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if 
(Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - } - - unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0]; - unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1]; - unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; - - ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0 - ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1 - ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds - Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 -} - -void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { - - std::map OptionalIdx; - bool GDSOnly = false; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - if (Op.isToken() && Op.getToken() == "gds") { - GDSOnly = true; - continue; - } - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - } - - unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; - ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset - - if (!GDSOnly) { - unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; - ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds - } - Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 -} - - -//===----------------------------------------------------------------------===// -// s_waitcnt -//===----------------------------------------------------------------------===// - -bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { - StringRef CntName = Parser.getTok().getString(); - int64_t CntVal; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::LParen)) - return true; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return true; - - if (getParser().parseAbsoluteExpression(CntVal)) - return true; - - if (getLexer().isNot(AsmToken::RParen)) - return true; - - Parser.Lex(); - if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) - Parser.Lex(); - - int CntShift; - int CntMask; - - if (CntName == "vmcnt") { - CntMask = 0xf; - CntShift = 0; - } else if (CntName == "expcnt") { - CntMask = 0x7; - CntShift = 4; - } else if (CntName == "lgkmcnt") { - CntMask = 0x7; - CntShift = 8; - } else { - return true; - } - - IntVal &= ~(CntMask << CntShift); - IntVal |= (CntVal << CntShift); - return false; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { - // Disable all counters by default. - // vmcnt [3:0] - // expcnt [6:4] - // lgkmcnt [10:8] - int64_t CntVal = 0x77f; - SMLoc S = Parser.getTok().getLoc(); - - switch(getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: - // The operand can be an integer value. 
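-    // e.g. "s_waitcnt 0x770" is accepted here and is equivalent to
-    // "s_waitcnt vmcnt(0)": starting from the all-disabled default 0x77f,
-    // parseCnt() above would clear the vmcnt field [3:0] and or in 0.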
- if (getParser().parseAbsoluteExpression(CntVal)) - return MatchOperand_ParseFail; - break; - - case AsmToken::Identifier: - do { - if (parseCnt(CntVal)) - return MatchOperand_ParseFail; - } while(getLexer().isNot(AsmToken::EndOfStatement)); - break; - } - Operands.push_back(AMDGPUOperand::CreateImm(CntVal, S)); - return MatchOperand_Success; -} - -bool AMDGPUOperand::isSWaitCnt() const { - return isImm(); -} - -//===----------------------------------------------------------------------===// -// sopp branch targets -//===----------------------------------------------------------------------===// - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - - switch (getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: { - int64_t Imm; - if (getParser().parseAbsoluteExpression(Imm)) - return MatchOperand_ParseFail; - Operands.push_back(AMDGPUOperand::CreateImm(Imm, S)); - return MatchOperand_Success; - } - - case AsmToken::Identifier: - Operands.push_back(AMDGPUOperand::CreateExpr( - MCSymbolRefExpr::create(getContext().getOrCreateSymbol( - Parser.getTok().getString()), getContext()), S)); - Parser.Lex(); - return MatchOperand_Success; - } -} - -//===----------------------------------------------------------------------===// -// mubuf -//===----------------------------------------------------------------------===// - -static const OptionalOperand MubufOptionalOps [] = { - {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, - {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) { - return parseOptionalOps(MubufOptionalOps, Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOffset(OperandVector &Operands) { - return parseIntWithPrefix("offset", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseGLC(OperandVector &Operands) { - return parseNamedBit("glc", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSLC(OperandVector &Operands) { - return parseNamedBit("slc", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseTFE(OperandVector &Operands) { - return parseNamedBit("tfe", Operands); -} - -bool AMDGPUOperand::isMubufOffset() const { - return isImm() && isUInt<12>(getImm()); -} - -void AMDGPUAsmParser::cvtMubuf(MCInst &Inst, - const OperandVector &Operands) { - std::map OptionalIdx; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle the case where soffset is an immediate - if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { - Op.addImmOperands(Inst, 1); - continue; - } - - // Handle tokens like 'offen' which are sometimes hard-coded into the - // asm string. There are no MCInst operands for these. 
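-    // (e.g. the trailing "offen" printed by an _OFFEN buffer variant is
-    // matched as a plain token and skipped here.)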
- if (Op.isToken()) { - continue; - } - assert(Op.isImm()); - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - } - - assert(OptionalIdx.size() == 4); - - unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; - unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; - unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; - unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; - - ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); -} - -//===----------------------------------------------------------------------===// -// mimg -//===----------------------------------------------------------------------===// - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDMask(OperandVector &Operands) { - return parseIntWithPrefix("dmask", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseUNorm(OperandVector &Operands) { - return parseNamedBit("unorm", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseR128(OperandVector &Operands) { - return parseNamedBit("r128", Operands); -} - -//===----------------------------------------------------------------------===// -// vop3 -//===----------------------------------------------------------------------===// - -static bool ConvertOmodMul(int64_t &Mul) { - if (Mul != 1 && Mul != 2 && Mul != 4) - return false; - - Mul >>= 1; - return true; -} - -static bool ConvertOmodDiv(int64_t &Div) { - if (Div == 1) { - Div = 0; - return true; - } - - if (Div == 2) { - Div = 3; - return true; - } - - return false; -} - -static const OptionalOperand VOP3OptionalOps [] = { - {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr}, - {"mul", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodMul}, - {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv}, -}; - -static bool isVOP3(OperandVector &Operands) { - if (operandsHaveModifiers(Operands)) - return true; - - AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]); - - if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID)) - return true; - - if (Operands.size() >= 5) - return true; - - if (Operands.size() > 3) { - AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]); - if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) || - Src1Op.isRegClass(AMDGPU::SReg_64RegClassID))) - return true; - } - return false; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { - - // The value returned by this function may change after parsing - // an operand so store the original value here. 
- bool HasModifiers = operandsHaveModifiers(Operands); - - bool IsVOP3 = isVOP3(Operands); - if (HasModifiers || IsVOP3 || - getLexer().isNot(AsmToken::EndOfStatement) || - getForcedEncodingSize() == 64) { - - AMDGPUAsmParser::OperandMatchResultTy Res = - parseOptionalOps(VOP3OptionalOps, Operands); - - if (!HasModifiers && Res == MatchOperand_Success) { - // We have added a modifier operation, so we need to make sure all - // previous register operands have modifiers - for (unsigned i = 2, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); - if (Op.isReg()) - Op.setModifiers(0); - } - } - return Res; - } - return MatchOperand_NoMatch; -} - -void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { - ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); - unsigned i = 2; - - std::map OptionalIdx; - - if (operandsHaveModifiers(Operands)) { - for (unsigned e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - if (Op.isRegWithInputMods()) { - ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2); - continue; - } - OptionalIdx[Op.getImmTy()] = i; - } - - unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp]; - unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod]; - - ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1); - } else { - for (unsigned e = Operands.size(); i != e; ++i) - ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1); - } -} - -/// Force static initialization. -extern "C" void LLVMInitializeR600AsmParser() { - RegisterMCAsmParser A(TheAMDGPUTarget); - RegisterMCAsmParser B(TheGCNTarget); -} - -#define GET_REGISTER_MATCHER -#define GET_MATCHER_IMPLEMENTATION -#include "AMDGPUGenAsmMatcher.inc" - Property changes on: projects/clang370-import/contrib/llvm/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIPrepareScratchRegs.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIPrepareScratchRegs.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIPrepareScratchRegs.cpp (nonexistent) @@ -1,194 +0,0 @@ -//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// This pass loads scratch pointer and scratch offset into a register or a -/// frame index which can be used anywhere in the program. These values will -/// be used for spilling VGPRs. 
-/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -namespace { - -class SIPrepareScratchRegs : public MachineFunctionPass { - -private: - static char ID; - -public: - SIPrepareScratchRegs() : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI prepare scratch registers"; - } - -}; - -} // End anonymous namespace - -char SIPrepareScratchRegs::ID = 0; - -FunctionPass *llvm::createSIPrepareScratchRegs() { - return new SIPrepareScratchRegs(); -} - -bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - MachineBasicBlock *Entry = MF.begin(); - MachineBasicBlock::iterator I = Entry->begin(); - DebugLoc DL = I->getDebugLoc(); - - // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to - // run this pass. - if (!MFI->hasSpilledVGPRs()) - return false; - - unsigned ScratchPtrPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchOffsetPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - - if (!Entry->isLiveIn(ScratchPtrPreloadReg)) - Entry->addLiveIn(ScratchPtrPreloadReg); - - if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) - Entry->addLiveIn(ScratchOffsetPreloadReg); - - // Load the scratch offset. - unsigned ScratchOffsetReg = - TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); - int ScratchOffsetFI = -1; - - if (ScratchOffsetReg != AMDGPU::NoRegister) { - // Found an SGPR to use - MRI.setPhysRegUsed(ScratchOffsetReg); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) - .addReg(ScratchOffsetPreloadReg); - } else { - // No SGPR is available, we must spill. - ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) - .addReg(ScratchOffsetPreloadReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } - - - // Now that we have the scratch pointer and offset values, we need to - // add them to all the SI_SPILL_V* instructions. - - RegScavenger RS; - unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); - RS.addScavengingFrameIndex(ScratchRsrcFI); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - // Add the scratch offset reg as a live-in so that the register scavenger - // doesn't re-use it. 
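-    // (The scavenger treats live-in registers as unavailable, so this keeps
-    // the SGPR it later hands out for the resource descriptor distinct from
-    // the scratch offset register.)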
- if (!MBB.isLiveIn(ScratchOffsetReg) && - ScratchOffsetReg != AMDGPU::NoRegister) - MBB.addLiveIn(ScratchOffsetReg); - RS.enterBasicBlock(&MBB); - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - RS.forward(I); - DebugLoc DL = MI.getDebugLoc(); - if (!TII->isVGPRSpill(MI.getOpcode())) - continue; - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | - 0xffffffff; // Size - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("ran out of SGPRs for spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - } - } - return true; -} Property changes on: projects/clang370-import/contrib/llvm/lib/Target/R600/SIPrepareScratchRegs.cpp ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp (nonexistent) @@ -1,469 +0,0 @@ -//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// -// -// The LLVM 
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief R600 Machine Scheduler interface
-//
-//===----------------------------------------------------------------------===//
-
-#include "R600MachineScheduler.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "misched"
-
-void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
-  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
-  DAG = static_cast<ScheduleDAGMILive*>(dag);
-  const AMDGPUSubtarget &ST = DAG->MF.getSubtarget<AMDGPUSubtarget>();
-  TII = static_cast<const R600InstrInfo*>(DAG->TII);
-  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
-  VLIW5 = !ST.hasCaymanISA();
-  MRI = &DAG->MRI;
-  CurInstKind = IDOther;
-  CurEmitted = 0;
-  OccupedSlotsMask = 31;
-  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
-  InstKindLimit[IDOther] = 32;
-  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
-  AluInstCount = 0;
-  FetchInstCount = 0;
-}
-
-void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
-                                  std::vector<SUnit *> &QDst)
-{
-  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
-  QSrc.clear();
-}
-
-static
-unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
-  assert (GPRCount && "GPRCount cannot be 0");
-  return 248 / GPRCount;
-}
-
-SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
-  SUnit *SU = nullptr;
-  NextInstKind = IDOther;
-
-  IsTopNode = false;
-
-  // Check if we might want to switch the current clause type.
-  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
-      (Available[CurInstKind].empty());
-  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
-      (!Available[IDFetch].empty() || !Available[IDOther].empty());
-
-  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
-    // We use the heuristic provided by the AMD Accelerated Parallel Processing
-    // OpenCL Programming Guide:
-    // the approximate number of WF that allows TEX instructions to hide ALU
-    // instructions is:
-    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
-    float ALUFetchRationEstimate =
-        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
-        (FetchInstCount + Available[IDFetch].size());
-    if (ALUFetchRationEstimate == 0) {
-      AllowSwitchFromAlu = true;
-    } else {
-      unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
-      DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
-      // We assume the local GPR requirements to be "dominated" by the
-      // requirement of the TEX clause (which consumes 128-bit regs); ALU
-      // instructions before and after TEX are indeed likely to consume or
-      // generate values from/for the TEX clause.
-      // Available[IDFetch].size() * 2: GPRs required in the Fetch clause.
-      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
-      // one GPR) or TmXYZW = TnXYZW (need 2 GPRs).
-      // (TODO: use RegisterPressure.)
-      // If we are going to use too many GPRs, we flush Fetch instructions to
-      // lower register pressure on 128-bit regs.
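      // [Editor's note -- worked example, not part of the original patch:
      // with the constants above, NeededWF = 62.5 / ALUFetchRatio, since
      // 500 / 8 = 62.5, and the divide computing the ratio is an integer
      // division in the source. Say 50 ALU-ish instructions face 10 pending
      // fetches: the ratio is 5, NeededWF = 12, the near GPR requirement is
      // 2 * 10 = 20, and getWFCountLimitedByGPR(20) = 248 / 20 = 12, so
      // 12 > 12 fails and ALU emission continues. With 11 fetches the
      // truncated ratio is 4, NeededWF = 15, the GPR limit is
      // 248 / 22 = 11, and the scheduler switches away to flush fetches.]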
-      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
-      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
-        AllowSwitchFromAlu = true;
-    }
-  }
-
-  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
-      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
-    // try to pick ALU
-    SU = pickAlu();
-    if (!SU && !PhysicalRegCopy.empty()) {
-      SU = PhysicalRegCopy.front();
-      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
-    }
-    if (SU) {
-      if (CurEmitted >= InstKindLimit[IDAlu])
-        CurEmitted = 0;
-      NextInstKind = IDAlu;
-    }
-  }
-
-  if (!SU) {
-    // try to pick FETCH
-    SU = pickOther(IDFetch);
-    if (SU)
-      NextInstKind = IDFetch;
-  }
-
-  // try to pick other
-  if (!SU) {
-    SU = pickOther(IDOther);
-    if (SU)
-      NextInstKind = IDOther;
-  }
-
-  DEBUG(
-      if (SU) {
-        dbgs() << " ** Pick node **\n";
-        SU->dump(DAG);
-      } else {
-        dbgs() << "NO NODE \n";
-        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
-          const SUnit &S = DAG->SUnits[i];
-          if (!S.isScheduled)
-            S.dump(DAG);
-        }
-      }
-  );
-
-  return SU;
-}
-
-void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
-  if (NextInstKind != CurInstKind) {
-    DEBUG(dbgs() << "Instruction Type Switch\n");
-    if (NextInstKind != IDAlu)
-      OccupedSlotsMask |= 31;
-    CurEmitted = 0;
-    CurInstKind = NextInstKind;
-  }
-
-  if (CurInstKind == IDAlu) {
-    AluInstCount ++;
-    switch (getAluKind(SU)) {
-    case AluT_XYZW:
-      CurEmitted += 4;
-      break;
-    case AluDiscarded:
-      break;
-    default: {
-      ++CurEmitted;
-      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
-          E = SU->getInstr()->operands_end(); It != E; ++It) {
-        MachineOperand &MO = *It;
-        if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
-          ++CurEmitted;
-      }
-    }
-    }
-  } else {
-    ++CurEmitted;
-  }
-
-
-  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
-
-  if (CurInstKind != IDFetch) {
-    MoveUnits(Pending[IDFetch], Available[IDFetch]);
-  } else
-    FetchInstCount++;
-}
-
-static bool
-isPhysicalRegCopy(MachineInstr *MI) {
-  if (MI->getOpcode() != AMDGPU::COPY)
-    return false;
-
-  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
-}
-
-void R600SchedStrategy::releaseTopNode(SUnit *SU) {
-  DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
-}
-
-void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
-  DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
-  if (isPhysicalRegCopy(SU->getInstr())) {
-    PhysicalRegCopy.push_back(SU);
-    return;
-  }
-
-  int IK = getInstKind(SU);
-
-  // There is no export clause; we can schedule one as soon as it's ready.
-  if (IK == IDOther)
-    Available[IDOther].push_back(SU);
-  else
-    Pending[IK].push_back(SU);
-
-}
-
-bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
-                                          const TargetRegisterClass *RC) const {
-  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
-    return RC->contains(Reg);
-  } else {
-    return MRI->getRegClass(Reg) == RC;
-  }
-}
-
-R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
-  MachineInstr *MI = SU->getInstr();
-
-  if (TII->isTransOnly(MI))
-    return AluTrans;
-
-  switch (MI->getOpcode()) {
-  case AMDGPU::PRED_X:
-    return AluPredX;
-  case AMDGPU::INTERP_PAIR_XY:
-  case AMDGPU::INTERP_PAIR_ZW:
-  case AMDGPU::INTERP_VEC_LOAD:
-  case AMDGPU::DOT_4:
-    return AluT_XYZW;
-  case AMDGPU::COPY:
-    if (MI->getOperand(1).isUndef()) {
-      // MI will become a KILL; don't consider it in scheduling.
-      return AluDiscarded;
-    }
-  default:
-    break;
-  }
-
-  // Does the instruction take a whole IG?
-  // XXX: Is it possible to add a helper function in R600InstrInfo that can
-  // be used here and in R600PacketizerList::isSoloInstruction() ?
-  if (TII->isVector(*MI) ||
-      TII->isCubeOp(MI->getOpcode()) ||
-      TII->isReductionOp(MI->getOpcode()) ||
-      MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
-    return AluT_XYZW;
-  }
-
-  if (TII->isLDSInstr(MI->getOpcode())) {
-    return AluT_X;
-  }
-
-  // Is the result already assigned to a channel?
-  unsigned DestSubReg = MI->getOperand(0).getSubReg();
-  switch (DestSubReg) {
-  case AMDGPU::sub0:
-    return AluT_X;
-  case AMDGPU::sub1:
-    return AluT_Y;
-  case AMDGPU::sub2:
-    return AluT_Z;
-  case AMDGPU::sub3:
-    return AluT_W;
-  default:
-    break;
-  }
-
-  // Is the result already a member of an X/Y/Z/W class?
-  unsigned DestReg = MI->getOperand(0).getReg();
-  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
-      regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
-    return AluT_X;
-  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
-    return AluT_Y;
-  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
-    return AluT_Z;
-  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
-    return AluT_W;
-  if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
-    return AluT_XYZW;
-
-  // LDS src registers cannot be used in the Trans slot.
-  if (TII->readsLDSSrcReg(MI))
-    return AluT_XYZW;
-
-  return AluAny;
-
-}
-
-int R600SchedStrategy::getInstKind(SUnit* SU) {
-  int Opcode = SU->getInstr()->getOpcode();
-
-  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
-    return IDFetch;
-
-  if (TII->isALUInstr(Opcode)) {
-    return IDAlu;
-  }
-
-  switch (Opcode) {
-  case AMDGPU::PRED_X:
-  case AMDGPU::COPY:
-  case AMDGPU::CONST_COPY:
-  case AMDGPU::INTERP_PAIR_XY:
-  case AMDGPU::INTERP_PAIR_ZW:
-  case AMDGPU::INTERP_VEC_LOAD:
-  case AMDGPU::DOT_4:
-    return IDAlu;
-  default:
-    return IDOther;
-  }
-}
-
-SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
-  if (Q.empty())
-    return nullptr;
-  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
-       It != E; ++It) {
-    SUnit *SU = *It;
-    InstructionsGroupCandidate.push_back(SU->getInstr());
-    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)
-        && (!AnyALU || !TII->isVectorOnly(SU->getInstr()))
-    ) {
-      InstructionsGroupCandidate.pop_back();
-      Q.erase((It + 1).base());
-      return SU;
-    } else {
-      InstructionsGroupCandidate.pop_back();
-    }
-  }
-  return nullptr;
-}
-
-void R600SchedStrategy::LoadAlu() {
-  std::vector<SUnit *> &QSrc = Pending[IDAlu];
-  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
-    AluKind AK = getAluKind(QSrc[i]);
-    AvailableAlus[AK].push_back(QSrc[i]);
-  }
-  QSrc.clear();
-}
-
-void R600SchedStrategy::PrepareNextSlot() {
-  DEBUG(dbgs() << "New Slot\n");
-  assert (OccupedSlotsMask && "Slot wasn't filled");
-  OccupedSlotsMask = 0;
-//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
-//    OccupedSlotsMask |= 16;
-  InstructionsGroupCandidate.clear();
-  LoadAlu();
-}
-
-void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
-  int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
-  if (DstIndex == -1) {
-    return;
-  }
-  unsigned DestReg = MI->getOperand(DstIndex).getReg();
-  // PressureRegister crashes if an operand is def and used in the same inst
-  // and we try to constrain its regclass.
-  for (MachineInstr::mop_iterator It = MI->operands_begin(),
-      E = MI->operands_end(); It != E; ++It) {
-    MachineOperand &MO = *It;
-    if (MO.isReg() && !MO.isDef() &&
-        MO.getReg() == DestReg)
-      return;
-  }
-  // Constrain the regclass of DestReg to assign it to Slot.
-  switch (Slot) {
-  case 0:
-    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
-    break;
-  case 1:
-    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
-    break;
-  case 2:
-    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
-    break;
-  case 3:
-    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
-    break;
-  }
-}
-
-SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
-  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
-  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
-  if (SlotedSU)
-    return SlotedSU;
-  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
-  if (UnslotedSU)
-    AssignSlot(UnslotedSU->getInstr(), Slot);
-  return UnslotedSU;
-}
-
-unsigned R600SchedStrategy::AvailablesAluCount() const {
-  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
-      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
-      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
-      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
-      AvailableAlus[AluPredX].size();
-}
-
-SUnit* R600SchedStrategy::pickAlu() {
-  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
-    if (!OccupedSlotsMask) {
-      // Bottom-up scheduling: predX must come first.
-      if (!AvailableAlus[AluPredX].empty()) {
-        OccupedSlotsMask |= 31;
-        return PopInst(AvailableAlus[AluPredX], false);
-      }
-      // Flush physical reg copies (RA will discard them).
-      if (!AvailableAlus[AluDiscarded].empty()) {
-        OccupedSlotsMask |= 31;
-        return PopInst(AvailableAlus[AluDiscarded], false);
-      }
-      // If there is a T_XYZW alu available, use it.
-      if (!AvailableAlus[AluT_XYZW].empty()) {
-        OccupedSlotsMask |= 15;
-        return PopInst(AvailableAlus[AluT_XYZW], false);
-      }
-    }
-    bool TransSlotOccuped = OccupedSlotsMask & 16;
-    if (!TransSlotOccuped && VLIW5) {
-      if (!AvailableAlus[AluTrans].empty()) {
-        OccupedSlotsMask |= 16;
-        return PopInst(AvailableAlus[AluTrans], false);
-      }
-      SUnit *SU = AttemptFillSlot(3, true);
-      if (SU) {
-        OccupedSlotsMask |= 16;
-        return SU;
-      }
-    }
-    for (int Chan = 3; Chan > -1; --Chan) {
-      bool isOccupied = OccupedSlotsMask & (1 << Chan);
-      if (!isOccupied) {
-        SUnit *SU = AttemptFillSlot(Chan, false);
-        if (SU) {
-          OccupedSlotsMask |= (1 << Chan);
-          InstructionsGroupCandidate.push_back(SU->getInstr());
-          return SU;
-        }
-      }
-    }
-    PrepareNextSlot();
-  }
-  return nullptr;
-}
-
-SUnit* R600SchedStrategy::pickOther(int QID) {
-  SUnit *SU = nullptr;
-  std::vector<SUnit *> &AQ = Available[QID];
-
-  if (AQ.empty()) {
-    MoveUnits(Pending[QID], AQ);
-  }
-  if (!AQ.empty()) {
-    SU = AQ.back();
-    AQ.resize(AQ.size() - 1);
-  }
-  return SU;
-}
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.td	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.td	(nonexistent)
@@ -1,26 +0,0 @@
-//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Tablegen register definitions common to all hw codegen targets.
-//
-//===----------------------------------------------------------------------===//
-
-let Namespace = "AMDGPU" in {
-
-foreach Index = 0-15 in {
-  // Indices are used in a variety of ways here, so don't set a size/offset.
-  def sub#Index : SubRegIndex<-1, -1>;
-}
-
-def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">;
-
-}
-
-include "R600RegisterInfo.td"
-include "SIRegisterInfo.td"
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.h
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.h	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.h	(nonexistent)
@@ -1,45 +0,0 @@
-//===-- AMDGPUMachineFunction.h - AMDGPU Machine Function Info ----*- C++ -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H
-#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H
-
-#include "llvm/CodeGen/MachineFunction.h"
-#include <map>
-
-namespace llvm {
-
-class AMDGPUMachineFunction : public MachineFunctionInfo {
-  virtual void anchor();
-  unsigned ShaderType;
-
-public:
-  AMDGPUMachineFunction(const MachineFunction &MF);
-  /// A map to keep track of local memory objects and their offsets within
-  /// the local memory space.
-  std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
-  /// Number of bytes in the LDS that are being used.
-  unsigned LDSSize;
-
-  /// Start of implicit kernel args
-  unsigned ABIArgOffset;
-
-  unsigned getShaderType() const {
-    return ShaderType;
-  }
-
-  unsigned ScratchSize;
-  bool IsKernel;
-};
-
-}
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600ISelLowering.h
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/R600ISelLowering.h	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600ISelLowering.h	(nonexistent)
@@ -1,80 +0,0 @@
-//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief R600 DAG Lowering interface definition
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H
-#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H
-
-#include "AMDGPUISelLowering.h"
-
-namespace llvm {
-
-class R600InstrInfo;
-
-class R600TargetLowering : public AMDGPUTargetLowering {
-public:
-  R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
-  MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
-                                                 MachineBasicBlock *BB) const override;
-  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
-  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
-  void ReplaceNodeResults(SDNode *N,
-                          SmallVectorImpl<SDValue> &Results,
-                          SelectionDAG &DAG) const override;
-  SDValue LowerFormalArguments(
-      SDValue Chain,
-      CallingConv::ID CallConv,
-      bool isVarArg,
-      const SmallVectorImpl<ISD::InputArg> &Ins,
-      SDLoc DL, SelectionDAG &DAG,
-      SmallVectorImpl<SDValue> &InVals) const override;
-  EVT getSetCCResultType(LLVMContext &, EVT VT) const override;
-private:
-  unsigned Gen;
-  /// Each OpenCL kernel has nine implicit parameters that are stored in the
-  /// first nine dwords of a Vertex Buffer. These implicit parameters are
-  /// lowered to load instructions which retrieve the values from the Vertex
-  /// Buffer.
-  SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
-                                 SDLoc DL, unsigned DwordOffset) const;
-
-  void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
-                              MachineRegisterInfo &MRI, unsigned dword_offset) const;
-  SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG,
-                          SDLoc DL) const;
-  SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
-
-  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
-                        unsigned mainop, unsigned ovf) const;
-
-  SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
-                             SelectionDAG &DAG) const;
-  void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
-                       unsigned &Channel, unsigned &PtrIncr) const;
-  bool isZero(SDValue Op) const;
-  SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
-};
-
-} // End namespace llvm
-
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp	(nonexistent)
@@ -1,370 +0,0 @@
-//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Implementation of the TargetInstrInfo class that is common to all
-/// AMD GPUs.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUInstrInfo.h"
-#include "AMDGPURegisterInfo.h"
-#include "AMDGPUTargetMachine.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
-using namespace llvm;
-
-#define GET_INSTRINFO_CTOR_DTOR
-#define GET_INSTRINFO_NAMED_OPS
-#define GET_INSTRMAP_INFO
-#include "AMDGPUGenInstrInfo.inc"
-
-// Pin the vtable to this file.
-void AMDGPUInstrInfo::anchor() {}
-
-AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st)
-    : AMDGPUGenInstrInfo(-1, -1), ST(st) {}
-
-const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
-  return RI;
-}
-
-bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
-                                            unsigned &SrcReg, unsigned &DstReg,
-                                            unsigned &SubIdx) const {
-// TODO: Implement this function
-  return false;
-}
-
-unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
-                                              int &FrameIndex) const {
-// TODO: Implement this function
-  return 0;
-}
-
-unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
-                                                    int &FrameIndex) const {
-// TODO: Implement this function
-  return 0;
-}
-
-bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
-                                           const MachineMemOperand *&MMO,
-                                           int &FrameIndex) const {
-// TODO: Implement this function
-  return false;
-}
-unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI,
-                                               int &FrameIndex) const {
-// TODO: Implement this function
-  return 0;
-}
-unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI,
-                                                     int &FrameIndex) const {
-// TODO: Implement this function
-  return 0;
-}
-bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI,
-                                            const MachineMemOperand *&MMO,
-                                            int &FrameIndex) const {
-// TODO: Implement this function
-  return false;
-}
-
-MachineInstr *
-AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
-                                       MachineBasicBlock::iterator &MBBI,
-                                       LiveVariables *LV) const {
-// TODO: Implement this function
-  return nullptr;
-}
-
-void
-AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator MI,
-                                     unsigned SrcReg, bool isKill,
-                                     int FrameIndex,
-                                     const TargetRegisterClass *RC,
-                                     const TargetRegisterInfo *TRI) const {
-  llvm_unreachable("Not Implemented");
-}
-
-void
-AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                      MachineBasicBlock::iterator MI,
-                                      unsigned DestReg, int FrameIndex,
-                                      const TargetRegisterClass *RC,
-                                      const TargetRegisterInfo *TRI) const {
-  llvm_unreachable("Not Implemented");
-}
-
-bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const {
-  MachineBasicBlock *MBB = MI->getParent();
-  int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                               AMDGPU::OpName::addr);
-  // addr is a custom operand with multiple MI operands, and only the
-  // first MI operand is given a name.
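  // [Editor's note, not part of the original patch: the operand layout
  // this relies on, reconstructed from the uses below -- only the offset
  // register of the custom 'addr' operand carries the name, so the
  // register-index immediate is found positionally at OffsetOpIdx + 1,
  // while 'dst', 'chan' and 'val' are still looked up by name through
  // getNamedOperandIdx.]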
-  int RegOpIdx = OffsetOpIdx + 1;
-  int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                             AMDGPU::OpName::chan);
-  if (isRegisterLoad(*MI)) {
-    int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                              AMDGPU::OpName::dst);
-    unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
-    unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
-    unsigned Address = calculateIndirectAddress(RegIndex, Channel);
-    unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
-    if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
-      buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
-                    getIndirectAddrRegClass()->getRegister(Address));
-    } else {
-      buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
-                        Address, OffsetReg);
-    }
-  } else if (isRegisterStore(*MI)) {
-    int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                              AMDGPU::OpName::val);
-    unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
-    unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
-    unsigned Address = calculateIndirectAddress(RegIndex, Channel);
-    unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
-    if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
-      buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
-                    MI->getOperand(ValOpIdx).getReg());
-    } else {
-      buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(),
-                         calculateIndirectAddress(RegIndex, Channel),
-                         OffsetReg);
-    }
-  } else {
-    return false;
-  }
-
-  MBB->erase(MI);
-  return true;
-}
-
-MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(
-    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
-    MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
-// TODO: Implement this function
-  return nullptr;
-}
-MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(
-    MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
-    MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const {
-  // TODO: Implement this function
-  return nullptr;
-}
-bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
-                                           ArrayRef<unsigned> Ops) const {
-  // TODO: Implement this function
-  return false;
-}
-bool
-AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
-                                     unsigned Reg, bool UnfoldLoad,
-                                     bool UnfoldStore,
-                                     SmallVectorImpl<MachineInstr *> &NewMIs) const {
-  // TODO: Implement this function
-  return false;
-}
-
-bool
-AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
-                                     SmallVectorImpl<SDNode *> &NewNodes) const {
-  // TODO: Implement this function
-  return false;
-}
-
-unsigned
-AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
-                                            bool UnfoldLoad, bool UnfoldStore,
-                                            unsigned *LoadRegIndex) const {
-  // TODO: Implement this function
-  return 0;
-}
-
-bool AMDGPUInstrInfo::enableClusterLoads() const {
-  return true;
-}
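// [Editor's note, not part of the original patch: the predicate below
// reduces to
//   cluster := NumLoads <= 16 && (Offset1 - Offset0) < 64
// so, for example, loads at byte offsets {0, 16, 48} are scheduled
// together, while a pair at offsets {0, 96} is not, 96 falling outside
// the assumed 64-byte global-memory cacheline.]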
-// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
-// the first 16 loads will be interleaved with the stores, and the next 16 will
-// be clustered as expected. It should really split into 2 16 store batches.
-//
-// Loads are clustered until this returns false, rather than trying to schedule
-// groups of stores. This also means we have to deal with saying different
-// address space loads should be clustered, and ones which might cause bank
-// conflicts.
-//
-// This might be deprecated so it might not be worth that much effort to fix.
-bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
-                                              int64_t Offset0, int64_t Offset1,
-                                              unsigned NumLoads) const {
-  assert(Offset1 > Offset0 &&
-         "Second offset should be larger than first offset!");
-  // If we have 16 loads or fewer in a row, and the offsets are within 64
-  // bytes of each other, then schedule them together.
-
-  // A cacheline is 64 bytes (for global memory).
-  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
-}
-
-bool
-AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
-    const {
-  // TODO: Implement this function
-  return true;
-}
-void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator MI) const {
-  // TODO: Implement this function
-}
-
-bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const {
-  // TODO: Implement this function
-  return false;
-}
-bool
-AMDGPUInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
-                                   const SmallVectorImpl<MachineOperand> &Pred2)
-    const {
-  // TODO: Implement this function
-  return false;
-}
-
-bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI,
-                                       std::vector<MachineOperand> &Pred) const {
-  // TODO: Implement this function
-  return false;
-}
-
-bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const {
-  // TODO: Implement this function
-  return MI->getDesc().isPredicable();
-}
-
-bool
-AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
-  // TODO: Implement this function
-  return true;
-}
-
-bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const {
-  return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE;
-}
-
-bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const {
-  return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
-}
-
-int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  int Offset = -1;
-
-  if (MFI->getNumObjects() == 0) {
-    return -1;
-  }
-
-  if (MRI.livein_empty()) {
-    return 0;
-  }
-
-  const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
-  for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
-                                            LE = MRI.livein_end();
-       LI != LE; ++LI) {
-    unsigned Reg = LI->first;
-    if (TargetRegisterInfo::isVirtualRegister(Reg) ||
-        !IndirectRC->contains(Reg))
-      continue;
-
-    unsigned RegIndex;
-    unsigned RegEnd;
-    for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
-         ++RegIndex) {
-      if (IndirectRC->getRegister(RegIndex) == Reg)
-        break;
-    }
-    Offset = std::max(Offset, (int)RegIndex);
-  }
-
-  return Offset + 1;
-}
-
-int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
-  int Offset = 0;
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  // Variable sized objects are not supported
-  assert(!MFI->hasVarSizedObjects());
-
-  if (MFI->getNumObjects() == 0) {
-    return -1;
-  }
-
-  Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1);
-
-  return getIndirectIndexBegin(MF) + Offset;
-}
-
-int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
-  switch (Channels) {
-  default: return Opcode;
-  case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);
-  case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);
-  case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
-  }
-}
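// [Editor's note, an illustrative reading, not from the patch:
// getMaskedMIMGOp above rewrites an image opcode to the variant that
// writes only 1, 2 or 3 channels -- e.g. when just the .x component of a
// four-channel sample is used, a caller can ask for the Channels_1 form;
// the default case keeps the original opcode.]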
-// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
-// header files, so we need to wrap it in a function that takes unsigned
-// instead.
-namespace llvm {
-namespace AMDGPU {
-static int getMCOpcode(uint16_t Opcode, unsigned Gen) {
-  return getMCOpcodeGen(Opcode, (enum Subtarget)Gen);
-}
-}
-}
-
-// This must be kept in sync with the SISubtarget class in SIInstrInfo.td
-enum SISubtarget {
-  SI = 0,
-  VI = 1
-};
-
-static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) {
-  switch (Gen) {
-  default:
-    return SI;
-  case AMDGPUSubtarget::VOLCANIC_ISLANDS:
-    return VI;
-  }
-}
-
-int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
-  int MCOp = AMDGPU::getMCOpcode(
-      Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration()));
-
-  // -1 means that Opcode is already a native instruction.
-  if (MCOp == -1)
-    return Opcode;
-
-  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
-  // no encoding in the given subtarget generation.
-  if (MCOp == (uint16_t)-1)
-    return -1;
-
-  return MCOp;
-}
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp	(nonexistent)
@@ -1,30 +0,0 @@
-//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUTargetMachine.h"
-#include "llvm/Support/TargetRegistry.h"
-
-using namespace llvm;
-
-/// \brief The target which supports all AMD GPUs. This will eventually
-/// be deprecated and there will be an R600 target and a GCN target.
-Target llvm::TheAMDGPUTarget;
-/// \brief The target for GCN GPUs
-Target llvm::TheGCNTarget;
-
-/// \brief Extern function to initialize the targets for the AMDGPU backend
-extern "C" void LLVMInitializeR600TargetInfo() {
-  RegisterTarget<Triple::r600, false>
-    R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
-  RegisterTarget<Triple::amdgcn, false> GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs");
-}
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp	(nonexistent)
@@ -1,96 +0,0 @@
-//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Spilling of EXEC masks used for control flow messes up control flow
-/// lowering, so mark all live intervals associated with CF instructions as
-/// non-spillable.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-cf-live-intervals"
-
-namespace {
-
-class SIFixControlFlowLiveIntervals : public MachineFunctionPass {
-public:
-  static char ID;
-
-public:
-  SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) {
-    initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  const char *getPassName() const override {
-    return "SI Fix CF Live Intervals";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<LiveIntervals>();
-    AU.setPreservesAll();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE,
-                      "SI Fix CF Live Intervals", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE,
-                    "SI Fix CF Live Intervals", false, false)
-
-char SIFixControlFlowLiveIntervals::ID = 0;
-
-char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID;
-
-FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() {
-  return new SIFixControlFlowLiveIntervals();
-}
-
-bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) {
-  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
-
-  for (const MachineBasicBlock &MBB : MF) {
-    for (const MachineInstr &MI : MBB) {
-      switch (MI.getOpcode()) {
-        case AMDGPU::SI_IF:
-        case AMDGPU::SI_ELSE:
-        case AMDGPU::SI_BREAK:
-        case AMDGPU::SI_IF_BREAK:
-        case AMDGPU::SI_ELSE_BREAK:
-        case AMDGPU::SI_END_CF: {
-          unsigned Reg = MI.getOperand(0).getReg();
-          LIS->getInterval(Reg).markNotSpillable();
-          break;
-        }
-        default:
-          break;
-      }
-    }
-  }
-
-  return false;
-}
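// [Editor's note, background not stated in the patch: the operand-0
// registers of SI_IF and friends hold 64-bit EXEC-mask values that the
// later control-flow lowering must still see as the direct result of
// their defining instructions; a spill and reload inserted by the
// register allocator would break that pairing, hence markNotSpillable()
// on every such interval.]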
Property changes on: projects/clang370-import/contrib/llvm/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h	(nonexistent)
@@ -1,113 +0,0 @@
-//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief AMDGPU Assembly printer class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
-#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
-
-#include "llvm/CodeGen/AsmPrinter.h"
-#include <vector>
-
-namespace llvm {
-
-class AMDGPUAsmPrinter : public AsmPrinter {
-private:
-  struct SIProgramInfo {
-    SIProgramInfo() :
-      VGPRBlocks(0),
-      SGPRBlocks(0),
-      Priority(0),
-      FloatMode(0),
-      Priv(0),
-      DX10Clamp(0),
-      DebugMode(0),
-      IEEEMode(0),
-      ScratchSize(0),
-      ComputePGMRSrc1(0),
-      LDSBlocks(0),
-      ScratchBlocks(0),
-      ComputePGMRSrc2(0),
-      NumVGPR(0),
-      NumSGPR(0),
-      FlatUsed(false),
-      VCCUsed(false),
-      CodeLen(0) {}
-
-    // Fields set in PGM_RSRC1 pm4 packet.
-    uint32_t VGPRBlocks;
-    uint32_t SGPRBlocks;
-    uint32_t Priority;
-    uint32_t FloatMode;
-    uint32_t Priv;
-    uint32_t DX10Clamp;
-    uint32_t DebugMode;
-    uint32_t IEEEMode;
-    uint32_t ScratchSize;
-
-    uint64_t ComputePGMRSrc1;
-
-    // Fields set in PGM_RSRC2 pm4 packet.
-    uint32_t LDSBlocks;
-    uint32_t ScratchBlocks;
-
-    uint64_t ComputePGMRSrc2;
-
-    uint32_t NumVGPR;
-    uint32_t NumSGPR;
-    uint32_t LDSSize;
-    bool FlatUsed;
-
-    // Bonus information for debugging.
-    bool VCCUsed;
-    uint64_t CodeLen;
-  };
-
-  void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
-  void findNumUsedRegistersSI(const MachineFunction &MF,
-                              unsigned &NumSGPR,
-                              unsigned &NumVGPR) const;
-
-  /// \brief Emit register usage information so that the GPU driver
-  /// can correctly setup the GPU state.
-  void EmitProgramInfoR600(const MachineFunction &MF);
-  void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
-  void EmitAmdKernelCodeT(const MachineFunction &MF,
-                          const SIProgramInfo &KernelInfo) const;
-
-public:
-  explicit AMDGPUAsmPrinter(TargetMachine &TM,
-                            std::unique_ptr<MCStreamer> Streamer);
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  const char *getPassName() const override {
-    return "AMDGPU Assembly Printer";
-  }
-
-  /// Implemented in AMDGPUMCInstLower.cpp
-  void EmitInstruction(const MachineInstr *MI) override;
-
-  void EmitEndOfAsmFile(Module &M) override;
-
-  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
-                       unsigned AsmVariant, const char *ExtraCode,
-                       raw_ostream &O) override;
-
-protected:
-  std::vector<std::string> DisasmLines, HexLines;
-  size_t DisasmLineMaxLen;
-};
-
-} // End namespace llvm
-
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrInfo.h
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrInfo.h	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrInfo.h	(nonexistent)
@@ -1,391 +0,0 @@
-//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Interface definition for SIInstrInfo.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H
-#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H
-
-#include "AMDGPUInstrInfo.h"
-#include "SIDefines.h"
-#include "SIRegisterInfo.h"
-
-namespace llvm {
-
-class SIInstrInfo : public AMDGPUInstrInfo {
-private:
-  const SIRegisterInfo RI;
-
-  unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
-                              MachineRegisterInfo &MRI,
-                              MachineOperand &SuperReg,
-                              const TargetRegisterClass *SuperRC,
-                              unsigned SubIdx,
-                              const TargetRegisterClass *SubRC) const;
-  MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI,
-                                         MachineRegisterInfo &MRI,
-                                         MachineOperand &SuperReg,
-                                         const TargetRegisterClass *SuperRC,
-                                         unsigned SubIdx,
-                                         const TargetRegisterClass *SubRC) const;
-
-  unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
-                         MachineBasicBlock::iterator MI,
-                         MachineRegisterInfo &MRI,
-                         const TargetRegisterClass *RC,
-                         const MachineOperand &Op) const;
-
-  void swapOperands(MachineBasicBlock::iterator Inst) const;
-
-  void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
-                               MachineInstr *Inst, unsigned Opcode) const;
-
-  void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
-                                MachineInstr *Inst, unsigned Opcode) const;
-
-  void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
-                            MachineInstr *Inst) const;
-  void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
-                           MachineInstr *Inst) const;
-
-  void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
-
-  bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
-                                    MachineInstr *MIb) const;
-
-  unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const;
-
-public:
-  explicit SIInstrInfo(const AMDGPUSubtarget &st);
-
-  const SIRegisterInfo &getRegisterInfo() const override {
-    return RI;
-  }
-
-  bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
-                                         AliasAnalysis *AA) const override;
-
-  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
-                               int64_t &Offset1,
-                               int64_t &Offset2) const override;
-
-  bool getLdStBaseRegImmOfs(MachineInstr *LdSt,
-                            unsigned &BaseReg, unsigned &Offset,
-                            const TargetRegisterInfo *TRI) const final;
-
-  bool shouldClusterLoads(MachineInstr *FirstLdSt,
-                          MachineInstr *SecondLdSt,
-                          unsigned NumLoads) const final;
-
-  void copyPhysReg(MachineBasicBlock &MBB,
-                   MachineBasicBlock::iterator MI, DebugLoc DL,
-                   unsigned DestReg, unsigned SrcReg,
-                   bool KillSrc) const override;
-
-  unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MI,
-                                    RegScavenger *RS,
-                                    unsigned TmpReg,
-                                    unsigned Offset,
-                                    unsigned Size) const;
-
-  void storeRegToStackSlot(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MI,
-                           unsigned SrcReg, bool isKill, int FrameIndex,
-                           const TargetRegisterClass *RC,
-                           const TargetRegisterInfo *TRI) const override;
-
-  void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator MI,
-                            unsigned DestReg, int FrameIndex,
-                            const TargetRegisterClass *RC,
-                            const TargetRegisterInfo *TRI) const override;
-
-  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
-
-  // \brief Returns an opcode that can be used to move a value to a \p DstRC
-  // register. If there is no hardware instruction that can store to \p
-  // DstRC, then AMDGPU::COPY is returned.
-  unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
-  unsigned commuteOpcode(const MachineInstr &MI) const;
-
-  MachineInstr *commuteInstruction(MachineInstr *MI,
-                                   bool NewMI = false) const override;
-  bool findCommutedOpIndices(MachineInstr *MI,
-                             unsigned &SrcOpIdx1,
-                             unsigned &SrcOpIdx2) const override;
-
-  bool isTriviallyReMaterializable(const MachineInstr *MI,
-                                   AliasAnalysis *AA = nullptr) const;
-
-  bool areMemAccessesTriviallyDisjoint(
-      MachineInstr *MIa, MachineInstr *MIb,
-      AliasAnalysis *AA = nullptr) const override;
-
-  MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
-                              MachineBasicBlock::iterator I,
-                              unsigned DstReg, unsigned SrcReg) const override;
-  bool isMov(unsigned Opcode) const override;
-
-  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
-
-  bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
-                     unsigned Reg, MachineRegisterInfo *MRI) const final;
-
-  unsigned getMachineCSELookAheadLimit() const override { return 500; }
-
-  bool isSALU(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::SALU;
-  }
-
-  bool isVALU(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::VALU;
-  }
-
-  bool isSOP1(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::SOP1;
-  }
-
-  bool isSOP2(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::SOP2;
-  }
-
-  bool isSOPC(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::SOPC;
-  }
-
-  bool isSOPK(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::SOPK;
-  }
-
-  bool isSOPP(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::SOPP;
-  }
-
-  bool isVOP1(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::VOP1;
-  }
-
-  bool isVOP2(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::VOP2;
-  }
-
-  bool isVOP3(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::VOP3;
-  }
-
-  bool isVOPC(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::VOPC;
-  }
-
-  bool isMUBUF(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
-  }
-
-  bool isMTBUF(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
-  }
-
-  bool isSMRD(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::SMRD;
-  }
-
-  bool isDS(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::DS;
-  }
-
-  bool isMIMG(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::MIMG;
-  }
-
-  bool isFLAT(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::FLAT;
-  }
-
-  bool isWQM(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::WQM;
-  }
-
-  bool isVGPRSpill(uint16_t Opcode) const {
-    return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
-  }
-
-  bool isInlineConstant(const APInt &Imm) const;
-  bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
-  bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
-
-  bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
-                         const MachineOperand &MO) const;
-
-  /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
-  /// This function will return false if you pass it a 32-bit instruction.
-  bool hasVALU32BitEncoding(unsigned Opcode) const;
-
-  /// \brief Returns true if this operand uses the constant bus.
-  bool usesConstantBus(const MachineRegisterInfo &MRI,
-                       const MachineOperand &MO,
-                       unsigned OpSize) const;
-
-  /// \brief Return true if this instruction has any modifiers.
-  /// e.g. src[012]_mod, omod, clamp.
-  bool hasModifiers(unsigned Opcode) const;
-
-  bool hasModifiersSet(const MachineInstr &MI,
-                       unsigned OpName) const;
-
-  bool verifyInstruction(const MachineInstr *MI,
-                         StringRef &ErrInfo) const override;
-
-  static unsigned getVALUOp(const MachineInstr &MI);
-
-  bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
-
-  /// \brief Return the correct register class for \p OpNo. For target-specific
-  /// instructions, this will return the register class that has been defined
-  /// in tablegen. For generic instructions, like REG_SEQUENCE, it will return
-  /// the register class of its machine operand; the correct register class is
-  /// inferred based on the other operands.
-  const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
-                                           unsigned OpNo) const;
-
-  /// \brief Return the size in bytes of the operand OpNo on the given
-  // instruction opcode.
-  unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const {
-    const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo];
-
-    if (OpInfo.RegClass == -1) {
-      // If this is an immediate operand, this must be a 32-bit literal.
-      assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE);
-      return 4;
-    }
-
-    return RI.getRegClass(OpInfo.RegClass)->getSize();
-  }
-
-  /// \brief This form should usually be preferred since it handles operands
-  /// with unknown register classes.
-  unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
-    return getOpRegClass(MI, OpNo)->getSize();
-  }
-
-  /// \returns true if it is legal for the operand at index \p OpNo
-  /// to read a VGPR.
-  bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
-
-  /// \brief Legalize the \p OpIndex operand of this instruction by inserting
-  /// a MOV. For example:
-  /// ADD_I32_e32 VGPR0, 15
-  /// to
-  /// MOV VGPR1, 15
-  /// ADD_I32_e32 VGPR0, VGPR1
-  ///
-  /// If the operand being legalized is a register, then a COPY will be used
-  /// instead of MOV.
-  void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const;
-
-  /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand
-  /// for \p MI.
-  bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
-                      const MachineOperand *MO = nullptr) const;
-
-  /// \brief Legalize all operands in this instruction. This function may
-  /// create new instructions and insert them before \p MI.
-  void legalizeOperands(MachineInstr *MI) const;
-
-  /// \brief Split an SMRD instruction into two smaller loads of half the
-  //  size storing the results in \p Lo and \p Hi.
-  void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC,
-                 unsigned HalfImmOp, unsigned HalfSGPROp,
-                 MachineInstr *&Lo, MachineInstr *&Hi) const;
-
-  void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
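  // [Editor's note, not part of the original patch: moveToVALU below is
  // the driver that ties this header together -- it rewrites a scalar
  // opcode via getVALUOp(), legalizes the result with legalizeOperands(),
  // and migrates dependent users of the instruction to the VALU as well;
  // SMRD loads take the splitSMRD/moveSMRDToVALU path declared above
  // instead.]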
-  /// \brief Replace this instruction's opcode with the equivalent VALU
-  /// opcode. This function will also move the users of \p MI to the
-  /// VALU if necessary.
-  void moveToVALU(MachineInstr &MI) const;
-
-  unsigned calculateIndirectAddress(unsigned RegIndex,
-                                    unsigned Channel) const override;
-
-  const TargetRegisterClass *getIndirectAddrRegClass() const override;
-
-  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
-                                         MachineBasicBlock::iterator I,
-                                         unsigned ValueReg,
-                                         unsigned Address,
-                                         unsigned OffsetReg) const override;
-
-  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
-                                        MachineBasicBlock::iterator I,
-                                        unsigned ValueReg,
-                                        unsigned Address,
-                                        unsigned OffsetReg) const override;
-  void reserveIndirectRegisters(BitVector &Reserved,
-                                const MachineFunction &MF) const;
-
-  void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
-              unsigned SavReg, unsigned IndexReg) const;
-
-  void insertNOPs(MachineBasicBlock::iterator MI, int Count) const;
-
-  /// \brief Returns the operand named \p Op. If \p MI does not have an
-  /// operand named \c Op, this function returns nullptr.
-  MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
-
-  const MachineOperand *getNamedOperand(const MachineInstr &MI,
-                                        unsigned OpName) const {
-    return getNamedOperand(const_cast<MachineInstr &>(MI), OpName);
-  }
-
-  uint64_t getDefaultRsrcDataFormat() const;
-
-};
-
-namespace AMDGPU {
-
-  int getVOPe64(uint16_t Opcode);
-  int getVOPe32(uint16_t Opcode);
-  int getCommuteRev(uint16_t Opcode);
-  int getCommuteOrig(uint16_t Opcode);
-  int getAddr64Inst(uint16_t Opcode);
-  int getAtomicRetOp(uint16_t Opcode);
-  int getAtomicNoRetOp(uint16_t Opcode);
-
-  const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
-  const uint64_t RSRC_TID_ENABLE = 1LL << 55;
-
-} // End namespace AMDGPU
-
-namespace SI {
-namespace KernelInputOffsets {
-
-/// Offsets in bytes from the start of the input buffer
-enum Offsets {
-  NGROUPS_X = 0,
-  NGROUPS_Y = 4,
-  NGROUPS_Z = 8,
-  GLOBAL_SIZE_X = 12,
-  GLOBAL_SIZE_Y = 16,
-  GLOBAL_SIZE_Z = 20,
-  LOCAL_SIZE_X = 24,
-  LOCAL_SIZE_Y = 28,
-  LOCAL_SIZE_Z = 32
-};
-
-} // End namespace KernelInputOffsets
-} // End namespace SI
-
-} // End namespace llvm
-
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.h
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.h	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.h	(nonexistent)
@@ -1,48 +0,0 @@
-//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
-//
-//===-----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H
-#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H
-
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Target/TargetIntrinsicInfo.h"
-
-namespace llvm {
-class TargetMachine;
-
-namespace AMDGPUIntrinsic {
-enum ID {
-  last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
-#define GET_INTRINSIC_ENUM_VALUES
-#include "AMDGPUGenIntrinsics.inc"
-#undef GET_INTRINSIC_ENUM_VALUES
-  , num_AMDGPU_intrinsics
-};
-
-} // end namespace AMDGPUIntrinsic
-
-class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
-public:
-  AMDGPUIntrinsicInfo();
-  std::string getName(unsigned IntrId, Type **Tys = nullptr,
-                      unsigned numTys = 0) const override;
-  unsigned lookupName(const char *Name, unsigned Len) const override;
-  bool isOverloaded(unsigned IID) const override;
-  Function *getDeclaration(Module *M, unsigned ID,
-                           Type **Tys = nullptr,
-                           unsigned numTys = 0) const override;
-};
-
-} // end namespace llvm
-
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/CaymanInstructions.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/CaymanInstructions.td	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/CaymanInstructions.td	(nonexistent)
@@ -1,226 +0,0 @@
-//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TableGen definitions for instructions which are available only on Cayman
-// family GPUs.
-//
-//===----------------------------------------------------------------------===//
-
-def isCayman : Predicate<"Subtarget->hasCaymanISA()">;
-
-//===----------------------------------------------------------------------===//
-// Cayman Instructions
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isCayman] in {
-
-def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24",
-  [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU
->;
-def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24",
-  [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU
->;
-
-def : IMad24Pat<MULADD_INT24_cm>;
-
-let isVector = 1 in {
-
-def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
-
-def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
-def MULHI_INT_cm : MULHI_INT_Common<0x90>;
-def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
-def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
-def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
-def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
-def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
-def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
-def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
-def SIN_cm : SIN_Common<0x8D>;
-def COS_cm : COS_Common<0x8E>;
-} // End isVector = 1
-
-def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
-
-def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
-
-defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
-defm : Expand24UBitOps<MULLO_UINT_cm, ADD_INT>;
-
-// RECIP_UINT emulation for Cayman
-// The multiplication scales from [0,1] to the unsigned integer range
-def : Pat <
-  (AMDGPUurecip i32:$src0),
-  (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)),
-                            (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
->;
-
-  def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
-    let ADDR = 0;
-    let POP_COUNT = 0;
-    let COUNT = 0;
-  }
-
-
-def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
-
-class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
-  CF_MEM_RAT_CACHELESS <0x14, 0, mask,
-                        (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
-                        "STORE_DWORD $rw_gpr, $index_gpr",
-                        [(global_store vt:$rw_gpr, i32:$index_gpr)]> {
-  let eop = 0; // This bit is not used on Cayman.
-}
-
-def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>;
-def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>;
-def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>;
-
-class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
-    : VTX_WORD0_cm, VTX_READ <name, buffer_id, outs, pattern> {
-
-  // Static fields
-  let VC_INST = 0;
-  let FETCH_TYPE = 2;
-  let FETCH_WHOLE_QUAD = 0;
-  let BUFFER_ID = buffer_id;
-  let SRC_REL = 0;
-  // XXX: We can infer this field based on the SRC_GPR. This would allow us
-  // to store vertex addresses in any channel, not just X.
-  let SRC_SEL_X = 0;
-  let SRC_SEL_Y = 0;
-  let STRUCTURED_READ = 0;
-  let LDS_REQ = 0;
-  let COALESCED_READ = 0;
-
-  let Inst{31-0} = Word0;
-}
-
-class VTX_READ_8_cm <bits<8> buffer_id, list<dag> pattern>
-    : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id,
-                   (outs R600_TReg32_X:$dst_gpr), pattern> {
-
-  let DST_SEL_X = 0;
-  let DST_SEL_Y = 7; // Masked
-  let DST_SEL_Z = 7; // Masked
-  let DST_SEL_W = 7; // Masked
-  let DATA_FORMAT = 1; // FMT_8
-}
-
-class VTX_READ_16_cm <bits<8> buffer_id, list<dag> pattern>
-    : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id,
-                   (outs R600_TReg32_X:$dst_gpr), pattern> {
-  let DST_SEL_X = 0;
-  let DST_SEL_Y = 7; // Masked
-  let DST_SEL_Z = 7; // Masked
-  let DST_SEL_W = 7; // Masked
-  let DATA_FORMAT = 5; // FMT_16
-
-}
-
-class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern>
-    : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id,
-                   (outs R600_TReg32_X:$dst_gpr), pattern> {
-
-  let DST_SEL_X = 0;
-  let DST_SEL_Y = 7; // Masked
-  let DST_SEL_Z = 7; // Masked
-  let DST_SEL_W = 7; // Masked
-  let DATA_FORMAT = 0xD; // COLOR_32
-
-  // This is not really necessary, but there were some GPU hangs that appeared
-  // to be caused by ALU instructions in the next instruction group that wrote
-  // to the $src_gpr registers of the VTX_READ.
-  // e.g.
-  // %T3_X = VTX_READ_PARAM_32_eg %T2_X, 24
-  // %T2_X = MOV %ZERO
-  // Adding this constraint prevents this from happening.
-  let Constraints = "$src_gpr.ptr = $dst_gpr";
-}
-
-class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern>
-    : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id,
-                   (outs R600_Reg64:$dst_gpr), pattern> {
-
-  let DST_SEL_X = 0;
-  let DST_SEL_Y = 1;
-  let DST_SEL_Z = 7;
-  let DST_SEL_W = 7;
-  let DATA_FORMAT = 0x1D; // COLOR_32_32
-}
-
-class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern>
-    : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
-                   (outs R600_Reg128:$dst_gpr), pattern> {
-
-  let DST_SEL_X = 0;
-  let DST_SEL_Y = 1;
-  let DST_SEL_Z = 2;
-  let DST_SEL_W = 3;
-  let DATA_FORMAT = 0x22; // COLOR_32_32_32_32
-
-  // XXX: Need to force VTX_READ_128 instructions to write to the same register
-  // that holds its buffer address to avoid potential hangs. We can't use
-  // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
-  // registers are different sizes.
-} - -//===----------------------------------------------------------------------===// -// VTX Read from parameter memory space -//===----------------------------------------------------------------------===// -def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0, - [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0, - [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0, - [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0, - [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0, - [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -//===----------------------------------------------------------------------===// -// VTX Read from global memory space -//===----------------------------------------------------------------------===// - -// 8-bit reads -def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1, - [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1, - [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] ->; - -// 32-bit reads -def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1, - [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 64-bit reads -def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1, - [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 128-bit reads -def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1, - [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -} // End isCayman - Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp (nonexistent) @@ -1,365 +0,0 @@ -//===-- SIAnnotateControlFlow.cpp - ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Annotates the control flow with hardware specific intrinsics. 
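-///
-/// A rough sketch of the annotated form (illustrative only; the exact
-/// operand types come from the intrinsic declarations in doInitialization
-/// below):
-///
-///   %ret  = call { i1, i64 } @llvm.SI.if(i1 %cond)
-///   %cond = extractvalue { i1, i64 } %ret, 0   ; new branch condition
-///   %mask = extractvalue { i1, i64 } %ret, 1   ; saved exec mask
-///   ...
-///   call void @llvm.SI.end.cf(i64 %mask)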
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-annotate-control-flow" - -namespace { - -// Complex types used in this pass -typedef std::pair StackEntry; -typedef SmallVector StackVector; - -// Intrinsic names the control flow is annotated with -static const char *const IfIntrinsic = "llvm.SI.if"; -static const char *const ElseIntrinsic = "llvm.SI.else"; -static const char *const BreakIntrinsic = "llvm.SI.break"; -static const char *const IfBreakIntrinsic = "llvm.SI.if.break"; -static const char *const ElseBreakIntrinsic = "llvm.SI.else.break"; -static const char *const LoopIntrinsic = "llvm.SI.loop"; -static const char *const EndCfIntrinsic = "llvm.SI.end.cf"; - -class SIAnnotateControlFlow : public FunctionPass { - - static char ID; - - Type *Boolean; - Type *Void; - Type *Int64; - Type *ReturnStruct; - - ConstantInt *BoolTrue; - ConstantInt *BoolFalse; - UndefValue *BoolUndef; - Constant *Int64Zero; - - Constant *If; - Constant *Else; - Constant *Break; - Constant *IfBreak; - Constant *ElseBreak; - Constant *Loop; - Constant *EndCf; - - DominatorTree *DT; - StackVector Stack; - - LoopInfo *LI; - - bool isTopOfStack(BasicBlock *BB); - - Value *popSaved(); - - void push(BasicBlock *BB, Value *Saved); - - bool isElse(PHINode *Phi); - - void eraseIfUnused(PHINode *Phi); - - void openIf(BranchInst *Term); - - void insertElse(BranchInst *Term); - - Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L); - - void handleLoop(BranchInst *Term); - - void closeControlFlow(BasicBlock *BB); - -public: - SIAnnotateControlFlow(): - FunctionPass(ID) { } - - bool doInitialization(Module &M) override; - - bool runOnFunction(Function &F) override; - - const char *getPassName() const override { - return "SI annotate control flow"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); - FunctionPass::getAnalysisUsage(AU); - } - -}; - -} // end anonymous namespace - -char SIAnnotateControlFlow::ID = 0; - -/// \brief Initialize all the types and constants used in the pass -bool SIAnnotateControlFlow::doInitialization(Module &M) { - LLVMContext &Context = M.getContext(); - - Void = Type::getVoidTy(Context); - Boolean = Type::getInt1Ty(Context); - Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); - - BoolTrue = ConstantInt::getTrue(Context); - BoolFalse = ConstantInt::getFalse(Context); - BoolUndef = UndefValue::get(Boolean); - Int64Zero = ConstantInt::get(Int64, 0); - - If = M.getOrInsertFunction( - IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); - - Else = M.getOrInsertFunction( - ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); - - Break = M.getOrInsertFunction( - BreakIntrinsic, Int64, Int64, (Type *)nullptr); - - IfBreak = M.getOrInsertFunction( - IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); - - ElseBreak = M.getOrInsertFunction( - ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); - - Loop = M.getOrInsertFunction( - LoopIntrinsic, Boolean, Int64, (Type *)nullptr); - - EndCf = M.getOrInsertFunction( 
- EndCfIntrinsic, Void, Int64, (Type *)nullptr); - - return false; -} - -/// \brief Is BB the last block saved on the stack ? -bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { - return !Stack.empty() && Stack.back().first == BB; -} - -/// \brief Pop the last saved value from the control flow stack -Value *SIAnnotateControlFlow::popSaved() { - return Stack.pop_back_val().second; -} - -/// \brief Push a BB and saved value to the control flow stack -void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { - Stack.push_back(std::make_pair(BB, Saved)); -} - -/// \brief Can the condition represented by this PHI node treated like -/// an "Else" block? -bool SIAnnotateControlFlow::isElse(PHINode *Phi) { - BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - if (Phi->getIncomingBlock(i) == IDom) { - - if (Phi->getIncomingValue(i) != BoolTrue) - return false; - - } else { - if (Phi->getIncomingValue(i) != BoolFalse) - return false; - - } - } - return true; -} - -// \brief Erase "Phi" if it is not used any more -void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { - if (!Phi->hasNUsesOrMore(1)) - Phi->eraseFromParent(); -} - -/// \brief Open a new "If" block -void SIAnnotateControlFlow::openIf(BranchInst *Term) { - Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); - Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); - push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); -} - -/// \brief Close the last "If" block and open a new "Else" block -void SIAnnotateControlFlow::insertElse(BranchInst *Term) { - Value *Ret = CallInst::Create(Else, popSaved(), "", Term); - Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); - push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); -} - -/// \brief Recursively handle the condition leading to a loop -Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, - llvm::Loop *L) { - - // Only search through PHI nodes which are inside the loop. If we try this - // with PHI nodes that are outside of the loop, we end up inserting new PHI - // nodes outside of the loop which depend on values defined inside the loop. - // This will break the module with - // 'Instruction does not dominate all users!' errors. 
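-  // For instance (hypothetical IR): if Cond were a PHI in a block outside L,
-  // the replacement i64 PHI created below would also live outside L while
-  // its incoming break flags are defined inside L, which is exactly the
-  // dominance violation described above.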
- PHINode *Phi = nullptr; - if ((Phi = dyn_cast(Cond)) && L->contains(Phi)) { - - BasicBlock *Parent = Phi->getParent(); - PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); - Value *Ret = NewPhi; - - // Handle all non-constant incoming values first - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = Phi->getIncomingValue(i); - BasicBlock *From = Phi->getIncomingBlock(i); - if (isa(Incoming)) { - NewPhi->addIncoming(Broken, From); - continue; - } - - Phi->setIncomingValue(i, BoolFalse); - Value *PhiArg = handleLoopCondition(Incoming, Broken, L); - NewPhi->addIncoming(PhiArg, From); - } - - BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); - - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - - Value *Incoming = Phi->getIncomingValue(i); - if (Incoming != BoolTrue) - continue; - - BasicBlock *From = Phi->getIncomingBlock(i); - if (From == IDom) { - CallInst *OldEnd = dyn_cast(Parent->getFirstInsertionPt()); - if (OldEnd && OldEnd->getCalledFunction() == EndCf) { - Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; - Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); - continue; - } - } - TerminatorInst *Insert = From->getTerminator(); - Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); - NewPhi->setIncomingValue(i, PhiArg); - } - eraseIfUnused(Phi); - return Ret; - - } else if (Instruction *Inst = dyn_cast(Cond)) { - BasicBlock *Parent = Inst->getParent(); - Instruction *Insert; - if (L->contains(Inst)) { - Insert = Parent->getTerminator(); - } else { - Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); - } - Value *Args[] = { Cond, Broken }; - return CallInst::Create(IfBreak, Args, "", Insert); - - } else { - llvm_unreachable("Unhandled loop condition!"); - } - return 0; -} - -/// \brief Handle a back edge (loop) -void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { - BasicBlock *BB = Term->getParent(); - llvm::Loop *L = LI->getLoopFor(BB); - BasicBlock *Target = Term->getSuccessor(1); - PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); - - Value *Cond = Term->getCondition(); - Term->setCondition(BoolTrue); - Value *Arg = handleLoopCondition(Cond, Broken, L); - - for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); - PI != PE; ++PI) { - - Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI); - } - - Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); - push(Term->getSuccessor(0), Arg); -}/// \brief Close the last opened control flow -void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { - llvm::Loop *L = LI->getLoopFor(BB); - - if (L && L->getHeader() == BB) { - // We can't insert an EndCF call into a loop header, because it will - // get executed on every iteration of the loop, when it should be - // executed only once before the loop. - SmallVector Latches; - L->getLoopLatches(Latches); - - std::vector Preds; - for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { - if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) - Preds.push_back(*PI); - } - BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, - LI, false); - } - - CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); -} - -/// \brief Annotate the control flow with intrinsics so the backend can -/// recognize if/then/else and loops. 
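-///
-/// The traversal below is depth-first over the CFG: a conditional branch
-/// opens an "if" (pushing the saved mask), a branch to an already-visited
-/// successor is treated as a loop back edge, and a block found on top of
-/// the stack closes the pending control flow.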
-bool SIAnnotateControlFlow::runOnFunction(Function &F) { - DT = &getAnalysis().getDomTree(); - LI = &getAnalysis().getLoopInfo(); - - for (df_iterator I = df_begin(&F.getEntryBlock()), - E = df_end(&F.getEntryBlock()); I != E; ++I) { - - BranchInst *Term = dyn_cast((*I)->getTerminator()); - - if (!Term || Term->isUnconditional()) { - if (isTopOfStack(*I)) - closeControlFlow(*I); - continue; - } - - if (I.nodeVisited(Term->getSuccessor(1))) { - if (isTopOfStack(*I)) - closeControlFlow(*I); - handleLoop(Term); - continue; - } - - if (isTopOfStack(*I)) { - PHINode *Phi = dyn_cast(Term->getCondition()); - if (Phi && Phi->getParent() == *I && isElse(Phi)) { - insertElse(Term); - eraseIfUnused(Phi); - continue; - } - closeControlFlow(*I); - } - openIf(Term); - } - - assert(Stack.empty()); - return true; -} - -/// \brief Create the annotation pass -FunctionPass *llvm::createSIAnnotateControlFlowPass() { - return new SIAnnotateControlFlow(); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp (nonexistent) @@ -1,642 +0,0 @@ -//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -// \file -//===----------------------------------------------------------------------===// - -#include "AMDGPUInstPrinter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/MathExtras.h" - -using namespace llvm; - -void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot, const MCSubtargetInfo &STI) { - OS.flush(); - printInstruction(MI, OS); - - printAnnotation(OS, Annot); -} - -void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); -} - -void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); -} - -void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); -} - -void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatDec(MI->getOperand(OpNo).getImm() & 0xff); -} - -void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); -} - -void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " offen"; -} - -void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " idxen"; -} - -void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " addr64"; -} - -void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if 
(MI->getOperand(OpNo).getImm()) { - O << " offset:"; - printU16ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - uint16_t Imm = MI->getOperand(OpNo).getImm(); - if (Imm != 0) { - O << " offset:"; - printU16ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset0:"; - printU8ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset1:"; - printU8ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " gds"; -} - -void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " glc"; -} - -void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " slc"; -} - -void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " tfe"; -} - -void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, - const MCRegisterInfo &MRI) { - switch (reg) { - case AMDGPU::VCC: - O << "vcc"; - return; - case AMDGPU::SCC: - O << "scc"; - return; - case AMDGPU::EXEC: - O << "exec"; - return; - case AMDGPU::M0: - O << "m0"; - return; - case AMDGPU::FLAT_SCR: - O << "flat_scratch"; - return; - case AMDGPU::VCC_LO: - O << "vcc_lo"; - return; - case AMDGPU::VCC_HI: - O << "vcc_hi"; - return; - case AMDGPU::EXEC_LO: - O << "exec_lo"; - return; - case AMDGPU::EXEC_HI: - O << "exec_hi"; - return; - case AMDGPU::FLAT_SCR_LO: - O << "flat_scratch_lo"; - return; - case AMDGPU::FLAT_SCR_HI: - O << "flat_scratch_hi"; - return; - default: - break; - } - - char Type; - unsigned NumRegs; - - if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 1; - } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 1; - } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 3; - } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 8; - } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 8; - } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 16; - } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 16; - } else { - O << getRegisterName(reg); - return; - } - - // The low 8 bits of the encoding value is the register index, for both VGPRs - // and SGPRs. 
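-  // For example (illustrative): a 4-dword SGPR tuple whose low encoding
-  // bits give index 8 is printed as "s[8:11]" by the range form below.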
- unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); - if (NumRegs == 1) { - O << Type << RegIdx; - return; - } - - O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; -} - -void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) - O << "_e64 "; - else - O << "_e32 "; - - printOperand(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { - int32_t SImm = static_cast(Imm); - if (SImm >= -16 && SImm <= 64) { - O << SImm; - return; - } - - if (Imm == FloatToBits(0.0f)) - O << "0.0"; - else if (Imm == FloatToBits(1.0f)) - O << "1.0"; - else if (Imm == FloatToBits(-1.0f)) - O << "-1.0"; - else if (Imm == FloatToBits(0.5f)) - O << "0.5"; - else if (Imm == FloatToBits(-0.5f)) - O << "-0.5"; - else if (Imm == FloatToBits(2.0f)) - O << "2.0"; - else if (Imm == FloatToBits(-2.0f)) - O << "-2.0"; - else if (Imm == FloatToBits(4.0f)) - O << "4.0"; - else if (Imm == FloatToBits(-4.0f)) - O << "-4.0"; - else - O << formatHex(static_cast(Imm)); -} - -void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { - int64_t SImm = static_cast(Imm); - if (SImm >= -16 && SImm <= 64) { - O << SImm; - return; - } - - if (Imm == DoubleToBits(0.0)) - O << "0.0"; - else if (Imm == DoubleToBits(1.0)) - O << "1.0"; - else if (Imm == DoubleToBits(-1.0)) - O << "-1.0"; - else if (Imm == DoubleToBits(0.5)) - O << "0.5"; - else if (Imm == DoubleToBits(-0.5)) - O << "-0.5"; - else if (Imm == DoubleToBits(2.0)) - O << "2.0"; - else if (Imm == DoubleToBits(-2.0)) - O << "-2.0"; - else if (Imm == DoubleToBits(4.0)) - O << "4.0"; - else if (Imm == DoubleToBits(-4.0)) - O << "-4.0"; - else - llvm_unreachable("64-bit literal constants not supported"); -} - -void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - switch (Op.getReg()) { - // This is the default predicate state, so we don't need to print it. - case AMDGPU::PRED_SEL_OFF: - break; - - default: - printRegOperand(Op.getReg(), O, MRI); - break; - } - } else if (Op.isImm()) { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - int RCID = Desc.OpInfo[OpNo].RegClass; - if (RCID != -1) { - const MCRegisterClass &ImmRC = MRI.getRegClass(RCID); - if (ImmRC.getSize() == 4) - printImmediate32(Op.getImm(), O); - else if (ImmRC.getSize() == 8) - printImmediate64(Op.getImm(), O); - else - llvm_unreachable("Invalid register class size"); - } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) { - printImmediate32(Op.getImm(), O); - } else { - // We hit this for the immediate instruction bits that don't yet have a - // custom printer. - // TODO: Eventually this should be unnecessary. - O << formatDec(Op.getImm()); - } - } else if (Op.isFPImm()) { - // We special case 0.0 because otherwise it will be printed as an integer. 
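-    // (FloatToBits(0.0) is 0, which would hit printImmediate32's small
-    // integer fast path above and print as plain "0".)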
- if (Op.getFPImm() == 0.0) - O << "0.0"; - else { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass); - - if (ImmRC.getSize() == 4) - printImmediate32(FloatToBits(Op.getFPImm()), O); - else if (ImmRC.getSize() == 8) - printImmediate64(DoubleToBits(Op.getFPImm()), O); - else - llvm_unreachable("Invalid register class size"); - } - } else if (Op.isExpr()) { - const MCExpr *Exp = Op.getExpr(); - Exp->print(O, &MAI); - } else { - llvm_unreachable("unknown operand type in printOperand"); - } -} - -void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned InputModifiers = MI->getOperand(OpNo).getImm(); - if (InputModifiers & SISrcMods::NEG) - O << '-'; - if (InputModifiers & SISrcMods::ABS) - O << '|'; - printOperand(MI, OpNo + 1, O); - if (InputModifiers & SISrcMods::ABS) - O << '|'; -} - -void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned Imm = MI->getOperand(OpNum).getImm(); - - if (Imm == 2) { - O << "P0"; - } else if (Imm == 1) { - O << "P20"; - } else if (Imm == 0) { - O << "P10"; - } else { - llvm_unreachable("Invalid interpolation parameter slot"); - } -} - -void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printOperand(MI, OpNo, O); - O << ", "; - printOperand(MI, OpNo + 1, O); -} - -void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, - raw_ostream &O, StringRef Asm, - StringRef Default) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isImm()); - if (Op.getImm() == 1) { - O << Asm; - } else { - O << Default; - } -} - -void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "|"); -} - -void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "_SAT"); -} - -void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " clamp"; -} - -void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int Imm = MI->getOperand(OpNo).getImm(); - if (Imm == SIOutMods::MUL2) - O << " mul:2"; - else if (Imm == SIOutMods::MUL4) - O << " mul:4"; - else if (Imm == SIOutMods::DIV2) - O << " div:2"; -} - -void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int32_t Imm = MI->getOperand(OpNo).getImm(); - O << Imm << '(' << BitsToFloat(Imm) << ')'; -} - -void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O.indent(25 - O.GetNumBytesInBuffer()), "*", " "); -} - -void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "-"); -} - -void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - switch (MI->getOperand(OpNo).getImm()) { - default: break; - case 1: - O << " * 2.0"; - break; - case 2: - O << " * 4.0"; - break; - case 3: - O << " / 2.0"; - break; - } -} - -void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "+"); -} - -void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "ExecMask,"); -} - -void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "Pred,"); -} - -void 
AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.getImm() == 0) { - O << " (MASKED)"; - } -} - -void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const char * chans = "XYZW"; - int sel = MI->getOperand(OpNo).getImm(); - - int chan = sel & 3; - sel >>= 2; - - if (sel >= 512) { - sel -= 512; - int cb = sel >> 12; - sel &= 4095; - O << cb << '[' << sel << ']'; - } else if (sel >= 448) { - sel -= 448; - O << sel; - } else if (sel >= 0){ - O << sel; - } - - if (sel >= 0) - O << '.' << chans[chan]; -} - -void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int BankSwizzle = MI->getOperand(OpNo).getImm(); - switch (BankSwizzle) { - case 1: - O << "BS:VEC_021/SCL_122"; - break; - case 2: - O << "BS:VEC_120/SCL_212"; - break; - case 3: - O << "BS:VEC_102/SCL_221"; - break; - case 4: - O << "BS:VEC_201"; - break; - case 5: - O << "BS:VEC_210"; - break; - default: - break; - } - return; -} - -void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Sel = MI->getOperand(OpNo).getImm(); - switch (Sel) { - case 0: - O << 'X'; - break; - case 1: - O << 'Y'; - break; - case 2: - O << 'Z'; - break; - case 3: - O << 'W'; - break; - case 4: - O << '0'; - break; - case 5: - O << '1'; - break; - case 7: - O << '_'; - break; - default: - break; - } -} - -void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned CT = MI->getOperand(OpNo).getImm(); - switch (CT) { - case 0: - O << 'U'; - break; - case 1: - O << 'N'; - break; - default: - break; - } -} - -void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int KCacheMode = MI->getOperand(OpNo).getImm(); - if (KCacheMode > 0) { - int KCacheBank = MI->getOperand(OpNo - 2).getImm(); - O << "CB" << KCacheBank << ':'; - int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); - int LineSize = (KCacheMode == 1) ? 16 : 32; - O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize; - } -} - -void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned SImm16 = MI->getOperand(OpNo).getImm(); - unsigned Msg = SImm16 & 0xF; - if (Msg == 2 || Msg == 3) { - unsigned Op = (SImm16 >> 4) & 0xF; - if (Msg == 3) - O << "Gs_done("; - else - O << "Gs("; - if (Op == 0) { - O << "nop"; - } else { - unsigned Stream = (SImm16 >> 8) & 0x3; - if (Op == 1) - O << "cut"; - else if (Op == 2) - O << "emit"; - else if (Op == 3) - O << "emit-cut"; - O << " stream " << Stream; - } - O << "), [m0] "; - } else if (Msg == 1) - O << "interrupt "; - else if (Msg == 15) - O << "system "; - else - O << "unknown(" << Msg << ") "; -} - -void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs - // SIInsertWaits.cpp bits usage does not match ISA docs description but it - // works so it might be a misprint in docs. 
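-  // Field layout as decoded below (which may differ from the ISA docs,
-  // per the note above):
-  //   simm16[3:0] = vmcnt, simm16[7:4] = expcnt, simm16[11:8] = lgkmcnt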
- unsigned SImm16 = MI->getOperand(OpNo).getImm(); - unsigned Vmcnt = SImm16 & 0xF; - unsigned Expcnt = (SImm16 >> 4) & 0xF; - unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; - - bool NeedSpace = false; - - if (Vmcnt != 0xF) { - O << "vmcnt(" << Vmcnt << ')'; - NeedSpace = true; - } - - if (Expcnt != 0x7) { - if (NeedSpace) - O << ' '; - O << "expcnt(" << Expcnt << ')'; - NeedSpace = true; - } - - if (Lgkmcnt != 0x7) { - if (NeedSpace) - O << ' '; - O << "lgkmcnt(" << Lgkmcnt << ')'; - } -} - -#include "AMDGPUGenAsmWriter.inc" Index: projects/clang370-import/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h (nonexistent) @@ -1,88 +0,0 @@ -//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H -#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H - -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { - -class AMDGPUInstPrinter : public MCInstPrinter { -public: - AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - //Autogenerated by tblgen - void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - static void printRegOperand(unsigned RegNo, raw_ostream &O, - const MCRegisterInfo &MRI); - -private: - void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printRegOperand(unsigned RegNo, raw_ostream &O); - void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printImmediate32(uint32_t I, raw_ostream &O); - void printImmediate64(uint64_t I, raw_ostream &O); - void printOperand(const MCInst 
*MI, unsigned OpNo, raw_ostream &O); - void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, - StringRef Asm, StringRef Default = ""); - static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printUpdateExecMask(const MCInst *MI, unsigned OpNo, - raw_ostream &O); - static void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O); -}; - -} // End namespace llvm - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.cpp (nonexistent) @@ -1,20 +0,0 @@ -//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "R600MachineFunctionInfo.h" - -using namespace llvm; - - -// Pin the vtable to this file. -void R600MachineFunctionInfo::anchor() {} - -R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) - : AMDGPUMachineFunction(MF) { } Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600Schedule.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600Schedule.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600Schedule.td (nonexistent) @@ -1,49 +0,0 @@ -//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// R600 has a VLIW architecture. 
On pre-cayman cards there are 5 instruction -// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS -// slot has been removed. -// -//===----------------------------------------------------------------------===// - - -def ALU_X : FuncUnit; -def ALU_Y : FuncUnit; -def ALU_Z : FuncUnit; -def ALU_W : FuncUnit; -def TRANS : FuncUnit; - -def AnyALU : InstrItinClass; -def VecALU : InstrItinClass; -def TransALU : InstrItinClass; -def XALU : InstrItinClass; - -def R600_VLIW5_Itin : ProcessorItineraries < - [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], - [], - [ - InstrItinData]>, - InstrItinData]>, - InstrItinData]>, - InstrItinData]>, - InstrItinData]> - ] ->; - -def R600_VLIW4_Itin : ProcessorItineraries < - [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL], - [], - [ - InstrItinData]>, - InstrItinData]>, - InstrItinData]>, - InstrItinData]> - ] ->; Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp (nonexistent) @@ -1,134 +0,0 @@ -//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include "R600InstrInfo.h" -#include "R600MachineScheduler.h" -#include "SIISelLowering.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/CodeGen/MachineScheduler.h" - -using namespace llvm; - -#define DEBUG_TYPE "amdgpu-subtarget" - -#define GET_SUBTARGETINFO_ENUM -#define GET_SUBTARGETINFO_TARGET_DESC -#define GET_SUBTARGETINFO_CTOR -#include "AMDGPUGenSubtargetInfo.inc" - -AMDGPUSubtarget & -AMDGPUSubtarget::initializeSubtargetDependencies(StringRef TT, StringRef GPU, - StringRef FS) { - // Determine default and user-specified characteristics - // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be - // enabled, but some instructions do not respect them and they run at the - // double precision rate, so don't enable by default. - // - // We want to be able to turn these off, but making this a subtarget feature - // for SI has the unhelpful behavior that it unsets everything else if you - // disable it. - - SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); - FullFS += FS; - - if (GPU == "" && Triple(TT).getArch() == Triple::amdgcn) - GPU = "SI"; - - ParseSubtargetFeatures(GPU, FullFS); - - // FIXME: I don't think think Evergreen has any useful support for - // denormals, but should be checked. Should we issue a warning somewhere - // if someone tries to enable these? 
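-  // (Illustrative) Turning the denormal defaults off works because FS is
-  // appended after them: a user string of "-fp64-denormals" yields
-  // "+promote-alloca,+fp64-denormals,-fp64-denormals", and the later entry
-  // wins when the feature string is parsed.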
- if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - FP32Denormals = false; - FP64Denormals = false; - } - return *this; -} - -AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS, - TargetMachine &TM) - : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), - DumpCode(false), R600ALUInst(false), HasVertexCache(false), - TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), - FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), - EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), - WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), - EnableVGPRSpilling(false), SGPRInitBug(false), - IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), - LDSBankCount(0), - FrameLowering(TargetFrameLowering::StackGrowsUp, - 64 * 16, // Maximum stack alignment (long16) - 0), - InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { - - initializeSubtargetDependencies(TT, GPU, FS); - - if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - InstrInfo.reset(new R600InstrInfo(*this)); - TLInfo.reset(new R600TargetLowering(TM, *this)); - } else { - InstrInfo.reset(new SIInstrInfo(*this)); - TLInfo.reset(new SITargetLowering(TM, *this)); - } -} - -unsigned AMDGPUSubtarget::getStackEntrySize() const { - assert(getGeneration() <= NORTHERN_ISLANDS); - switch(getWavefrontSize()) { - case 16: - return 8; - case 32: - return hasCaymanISA() ? 4 : 8; - case 64: - return 4; - default: - llvm_unreachable("Illegal wavefront size."); - } -} - -unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const { - switch(getGeneration()) { - default: llvm_unreachable("ChipID unknown"); - case SEA_ISLANDS: return 12; - } -} - -bool AMDGPUSubtarget::isVGPRSpillingEnabled( - const SIMachineFunctionInfo *MFI) const { - return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling; -} - -void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, - MachineInstr *end, - unsigned NumRegionInstrs) const { - if (getGeneration() >= SOUTHERN_ISLANDS) { - - // Track register pressure so the scheduler can try to decrease - // pressure once register usage is above the threshold defined by - // SIRegisterInfo::getRegPressureSetLimit() - Policy.ShouldTrackPressure = true; - - // Enabling both top down and bottom up scheduling seems to give us less - // register spills than just using one of these approaches on its own. - Policy.OnlyTopDown = false; - Policy.OnlyBottomUp = false; - } -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIISelLowering.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIISelLowering.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIISelLowering.h (nonexistent) @@ -1,125 +0,0 @@ -//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief SI DAG Lowering interface definition -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H -#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H - -#include "AMDGPUISelLowering.h" -#include "SIInstrInfo.h" - -namespace llvm { - -class SITargetLowering : public AMDGPUTargetLowering { - SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, - SDValue Chain, unsigned Offset, bool Signed) const; - SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, - SelectionDAG &DAG) const; - SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, - SelectionDAG &DAG) const override; - - SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; - SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - - void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; - - SDValue performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI) const; - SDValue performSHLPtrCombine(SDNode *N, - unsigned AS, - DAGCombinerInfo &DCI) const; - SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; - - SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; - -public: - SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); - - bool isShuffleMaskLegal(const SmallVectorImpl &/*Mask*/, - EVT /*VT*/) const override; - - bool isLegalAddressingMode(const AddrMode &AM, - Type *Ty, unsigned AS) const override; - - bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, - unsigned Align, - bool *IsFast) const override; - - EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const override; - - TargetLoweringBase::LegalizeTypeAction - getPreferredVectorAction(EVT VT) const override; - - bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; - - SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; - - MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, - MachineBasicBlock * BB) const override; - bool enableAggressiveFMAFusion(EVT VT) const override; - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - MVT getScalarShiftAmountTy(EVT VT) const override; - bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const 
override; - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; - void AdjustInstrPostInstrSelection(MachineInstr *MI, - SDNode *Node) const override; - - int32_t analyzeImmediate(const SDNode *N) const; - SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const override; - void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; - - MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const; - MachineSDNode *buildRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr, - uint32_t RsrcDword1, - uint64_t RsrcDword2And3) const; - MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const; - - std::pair getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, - const std::string &Constraint, MVT VT) const override; - SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; -}; - -} // End namespace llvm - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp (nonexistent) @@ -1,2866 +0,0 @@ -//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This is the parent TargetLowering class for hardware code gen -/// targets. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUISelLowering.h" -#include "AMDGPU.h" -#include "AMDGPUFrameLowering.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600MachineFunctionInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/DiagnosticPrinter.h" - -using namespace llvm; - -namespace { - -/// Diagnostic information for unimplemented or unsupported feature reporting. 
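-/// (Illustrative) print() below renders these as
-///   "unsupported <description> in <function name>".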
-class DiagnosticInfoUnsupported : public DiagnosticInfo { -private: - const Twine &Description; - const Function &Fn; - - static int KindID; - - static int getKindID() { - if (KindID == 0) - KindID = llvm::getNextAvailablePluginDiagnosticKind(); - return KindID; - } - -public: - DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, - DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(getKindID(), Severity), - Description(Desc), - Fn(Fn) { } - - const Function &getFunction() const { return Fn; } - const Twine &getDescription() const { return Description; } - - void print(DiagnosticPrinter &DP) const override { - DP << "unsupported " << getDescription() << " in " << Fn.getName(); - } - - static bool classof(const DiagnosticInfo *DI) { - return DI->getKind() == getKindID(); - } -}; - -int DiagnosticInfoUnsupported::KindID = 0; -} - - -static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), - ArgFlags.getOrigAlign()); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); - - return true; -} - -#include "AMDGPUGenCallingConv.inc" - -// Find a larger type to do a load / store of a vector with. -EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { - unsigned StoreSize = VT.getStoreSizeInBits(); - if (StoreSize <= 32) - return EVT::getIntegerVT(Ctx, StoreSize); - - assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); - return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); -} - -// Type for a vector that will be loaded to. -EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { - unsigned StoreSize = VT.getStoreSizeInBits(); - if (StoreSize <= 32) - return EVT::getIntegerVT(Ctx, 32); - - return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); -} - -AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) - : TargetLowering(TM), Subtarget(&STI) { - setOperationAction(ISD::Constant, MVT::i32, Legal); - setOperationAction(ISD::Constant, MVT::i64, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); - - // We need to custom lower some of the intrinsics - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - - // Library functions. These default to Expand, but we have instructions - // for them. - setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FEXP2, MVT::f32, Legal); - setOperationAction(ISD::FPOW, MVT::f32, Legal); - setOperationAction(ISD::FLOG2, MVT::f32, Legal); - setOperationAction(ISD::FABS, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); - - setOperationAction(ISD::FROUND, MVT::f32, Custom); - setOperationAction(ISD::FROUND, MVT::f64, Custom); - - setOperationAction(ISD::FREM, MVT::f32, Custom); - setOperationAction(ISD::FREM, MVT::f64, Custom); - - // v_mad_f32 does not support denormals according to some sources. - if (!Subtarget->hasFP32Denormals()) - setOperationAction(ISD::FMAD, MVT::f32, Legal); - - // Expand to fneg + fadd. 
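-  // i.e. (fsub x, y) is legalized as (fadd x, (fneg y)).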
- setOperationAction(ISD::FSUB, MVT::f64, Expand); - - // Lower floating point store/load to integer store/load to reduce the number - // of patterns in tablegen. - setOperationAction(ISD::STORE, MVT::f32, Promote); - AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); - - setOperationAction(ISD::STORE, MVT::v2f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); - - setOperationAction(ISD::STORE, MVT::v4f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); - - setOperationAction(ISD::STORE, MVT::v8f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); - - setOperationAction(ISD::STORE, MVT::v16f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); - - setOperationAction(ISD::STORE, MVT::f64, Promote); - AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); - - setOperationAction(ISD::STORE, MVT::v2f64, Promote); - AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); - - // Custom lowering of vector stores is required for local address space - // stores. - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); - setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); - setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); - - // XXX: This can be change to Custom, once ExpandVectorStores can - // handle 64-bit stores. - setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); - - setTruncStoreAction(MVT::i64, MVT::i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i8, Expand); - setTruncStoreAction(MVT::i64, MVT::i1, Expand); - setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); - setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); - - - setOperationAction(ISD::LOAD, MVT::f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); - - setOperationAction(ISD::LOAD, MVT::v2f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); - - setOperationAction(ISD::LOAD, MVT::v4f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); - - setOperationAction(ISD::LOAD, MVT::v8f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); - - setOperationAction(ISD::LOAD, MVT::v16f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); - - setOperationAction(ISD::LOAD, MVT::f64, Promote); - AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); - - setOperationAction(ISD::LOAD, MVT::v2f64, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); - - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - - // There are no 64-bit extloads. These should be done as a 32-bit extload and - // an extension to 64-bit. 
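-  // e.g. (illustrative) an i64 zextload from i8 becomes a 32-bit zextload
-  // followed by a zero extension to i64 once the actions below are Expand.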
- for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); - } - - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); - } - - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - - if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - setOperationAction(ISD::FCEIL, MVT::f64, Custom); - setOperationAction(ISD::FTRUNC, MVT::f64, Custom); - setOperationAction(ISD::FRINT, MVT::f64, Custom); - setOperationAction(ISD::FFLOOR, MVT::f64, Custom); - } - - if (!Subtarget->hasBFI()) { - // fcopysign can be done in a single instruction with BFI. - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - } - - setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); - - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); - setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); - setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); - - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - - const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; - for (MVT VT : ScalarIntVTs) { - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::SDIV, VT, Expand); - - // GPU does not have divrem function for signed or unsigned. - setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); - - // GPU does not have [S|U]MUL_LOHI functions as a single instruction. - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - - setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); - } - - if (!Subtarget->hasBCNT(32)) - setOperationAction(ISD::CTPOP, MVT::i32, Expand); - - if (!Subtarget->hasBCNT(64)) - setOperationAction(ISD::CTPOP, MVT::i64, Expand); - - // The hardware supports 32-bit ROTR, but not ROTL. 
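-  // Note: rotl(x, n) == rotr(x, (32 - n) & 31) as a bit-twiddling identity,
-  // so expanding ROTL for i32 loses nothing the hardware rotate cannot
-  // express; the i64 rotates below have no hardware form at all.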
- setOperationAction(ISD::ROTL, MVT::i32, Expand); - setOperationAction(ISD::ROTL, MVT::i64, Expand); - setOperationAction(ISD::ROTR, MVT::i64, Expand); - - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i64, Expand); - setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - - setOperationAction(ISD::SMIN, MVT::i32, Legal); - setOperationAction(ISD::UMIN, MVT::i32, Legal); - setOperationAction(ISD::SMAX, MVT::i32, Legal); - setOperationAction(ISD::UMAX, MVT::i32, Legal); - - if (!Subtarget->hasFFBH()) - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - - if (!Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - - static const MVT::SimpleValueType VectorIntTypes[] = { - MVT::v2i32, MVT::v4i32 - }; - - for (MVT VT : VectorIntTypes) { - // Expand the following operations for the current type by default. - setOperationAction(ISD::ADD, VT, Expand); - setOperationAction(ISD::AND, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT, Expand); - setOperationAction(ISD::FP_TO_UINT, VT, Expand); - setOperationAction(ISD::MUL, VT, Expand); - setOperationAction(ISD::OR, VT, Expand); - setOperationAction(ISD::SHL, VT, Expand); - setOperationAction(ISD::SRA, VT, Expand); - setOperationAction(ISD::SRL, VT, Expand); - setOperationAction(ISD::ROTL, VT, Expand); - setOperationAction(ISD::ROTR, VT, Expand); - setOperationAction(ISD::SUB, VT, Expand); - setOperationAction(ISD::SINT_TO_FP, VT, Expand); - setOperationAction(ISD::UINT_TO_FP, VT, Expand); - setOperationAction(ISD::SDIV, VT, Expand); - setOperationAction(ISD::UDIV, VT, Expand); - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::UREM, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::SUBE, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::XOR, VT, Expand); - setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - } - - static const MVT::SimpleValueType FloatVectorTypes[] = { - MVT::v2f32, MVT::v4f32 - }; - - for (MVT VT : FloatVectorTypes) { - setOperationAction(ISD::FABS, VT, Expand); - setOperationAction(ISD::FMINNUM, VT, Expand); - setOperationAction(ISD::FMAXNUM, VT, Expand); - setOperationAction(ISD::FADD, VT, Expand); - setOperationAction(ISD::FCEIL, VT, Expand); - setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FDIV, VT, Expand); - setOperationAction(ISD::FEXP2, VT, Expand); - setOperationAction(ISD::FLOG2, VT, 
Expand); - setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FPOW, VT, Expand); - setOperationAction(ISD::FFLOOR, VT, Expand); - setOperationAction(ISD::FTRUNC, VT, Expand); - setOperationAction(ISD::FMUL, VT, Expand); - setOperationAction(ISD::FMA, VT, Expand); - setOperationAction(ISD::FRINT, VT, Expand); - setOperationAction(ISD::FNEARBYINT, VT, Expand); - setOperationAction(ISD::FSQRT, VT, Expand); - setOperationAction(ISD::FSIN, VT, Expand); - setOperationAction(ISD::FSUB, VT, Expand); - setOperationAction(ISD::FNEG, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::FCOPYSIGN, VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - } - - setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); - - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::STORE); - - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); - - setBooleanContents(ZeroOrNegativeOneBooleanContent); - setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - - setSchedulingPreference(Sched::RegPressure); - setJumpIsExpensive(true); - - // SI at least has hardware support for floating point exceptions, but no way - // of using or handling them is implemented. They are also optional in OpenCL - // (Section 7.3) - setHasFloatingPointExceptions(false); - - setSelectIsExpensive(false); - PredictableSelectIsExpensive = false; - - // There are no integer divide instructions, and these expand to a pretty - // large sequence of instructions. - setIntDivIsCheap(false); - setPow2SDivIsCheap(false); - setFsqrtIsCheap(true); - - // FIXME: Need to really handle these. - MaxStoresPerMemcpy = 4096; - MaxStoresPerMemmove = 4096; - MaxStoresPerMemset = 4096; -} - -//===----------------------------------------------------------------------===// -// Target Information -//===----------------------------------------------------------------------===// - -MVT AMDGPUTargetLowering::getVectorIdxTy() const { - return MVT::i32; -} - -bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { - return true; -} - -// The backend supports 32 and 64 bit floating point immediates. -// FIXME: Why are we reporting vectors of FP immediates as legal? -bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - EVT ScalarVT = VT.getScalarType(); - return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); -} - -// We don't want to shrink f64 / f32 constants. -bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { - EVT ScalarVT = VT.getScalarType(); - return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); -} - -bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, - ISD::LoadExtType, - EVT NewVT) const { - - unsigned NewSize = NewVT.getStoreSizeInBits(); - - // If we are reducing to a 32-bit load, this is always better. - if (NewSize == 32) - return true; - - EVT OldVT = N->getValueType(0); - unsigned OldSize = OldVT.getStoreSizeInBits(); - - // Don't produce extloads from sub 32-bit types. SI doesn't have scalar - // extloads, so doing one requires using a buffer_load. In cases where we - // still couldn't use a scalar load, using the wider load shouldn't really - // hurt anything. - - // If the old size already had to be an extload, there's no harm in continuing - // to reduce the width. 
-  return (OldSize < 32);
-}
-
-bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
-                                                   EVT CastTy) const {
-  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
-    return true;
-
-  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
-  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
-
-  return ((LScalarSize <= CastScalarSize) ||
-          (CastScalarSize >= 32) ||
-          (LScalarSize < 32));
-}
-
-// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
-// profitable with the expansion for 64-bit since it's generally good to
-// speculate things.
-// FIXME: These should really have the size as a parameter.
-bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
-  return true;
-}
-
-bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
-  return true;
-}
-
-//===---------------------------------------------------------------------===//
-// Target Properties
-//===---------------------------------------------------------------------===//
-
-bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
-  assert(VT.isFloatingPoint());
-  return VT == MVT::f32 || VT == MVT::f64;
-}
-
-bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
-  assert(VT.isFloatingPoint());
-  return VT == MVT::f32 || VT == MVT::f64;
-}
-
-bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
-                                                        unsigned NumElem,
-                                                        unsigned AS) const {
-  return true;
-}
-
-bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
-  // Truncate is just accessing a subregister.
-  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
-}
-
-bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
-  // Truncate is just accessing a subregister.
-  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
-         (Dest->getPrimitiveSizeInBits() % 32 == 0);
-}
-
-bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
-  const DataLayout *DL = getDataLayout();
-  unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
-  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
-
-  return SrcSize == 32 && DestSize == 64;
-}
-
-bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
-  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
-  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
-  // this will enable reducing 64-bit operations to 32-bit, which is always
-  // good.
-  return Src == MVT::i32 && Dest == MVT::i64;
-}
-
-bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
-  return isZExtFree(Val.getValueType(), VT2);
-}
-
-bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
-  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
-  // limited number of native 64-bit operations. Shrinking an operation to fit
-  // in a single 32-bit register should always be helpful. As currently used,
-  // this is much less general than the name suggests, and is only used in
-  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
-  // not profitable, and may actually be harmful.
-  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
-}
-
-//===---------------------------------------------------------------------===//
-// TargetLowering Callbacks
-//===---------------------------------------------------------------------===//
-
-void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
-                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
-
-  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
-}
-
-SDValue AMDGPUTargetLowering::LowerReturn(
-    SDValue Chain,
-    CallingConv::ID CallConv,
-    bool isVarArg,
-    const SmallVectorImpl<ISD::OutputArg> &Outs,
-    const SmallVectorImpl<SDValue> &OutVals,
-    SDLoc DL, SelectionDAG &DAG) const {
-  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
-}
-
-//===---------------------------------------------------------------------===//
-// Target specific lowering
-//===---------------------------------------------------------------------===//
-
-SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
-                                        SmallVectorImpl<SDValue> &InVals) const {
-  SDValue Callee = CLI.Callee;
-  SelectionDAG &DAG = CLI.DAG;
-
-  const Function &Fn = *DAG.getMachineFunction().getFunction();
-
-  StringRef FuncName("<unknown>");
-
-  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
-    FuncName = G->getSymbol();
-  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
-    FuncName = G->getGlobal()->getName();
-
-  DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
-  DAG.getContext()->diagnose(NoCalls);
-  return SDValue();
-}
-
-SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
-                                             SelectionDAG &DAG) const {
-  switch (Op.getOpcode()) {
-  default:
-    Op.getNode()->dump();
-    llvm_unreachable("Custom lowering code for this "
-                     "instruction is not implemented yet!");
-    break;
-  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
-  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
-  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
-  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
-  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
-  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
-  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
-  case ISD::FREM: return LowerFREM(Op, DAG);
-  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
-  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
-  case ISD::FRINT: return LowerFRINT(Op, DAG);
-  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
-  case ISD::FROUND: return LowerFROUND(Op, DAG);
-  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
-  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
-  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
-  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
-  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
-  }
-  return Op;
-}
-
-void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
-                                              SmallVectorImpl<SDValue> &Results,
-                                              SelectionDAG &DAG) const {
-  switch (N->getOpcode()) {
-  case ISD::SIGN_EXTEND_INREG:
-    // Different parts of legalization seem to interpret which type of
-    // sign_extend_inreg is the one to check for custom lowering. The extended
-    // from type is what really matters, but some places check for custom
-    // lowering of the result type. This results in trying to use
-    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
-    // nothing here and let the illegal result integer be handled normally.
-    return;
-  case ISD::LOAD: {
-    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
-    if (!Node)
-      return;
-
-    Results.push_back(SDValue(Node, 0));
-    Results.push_back(SDValue(Node, 1));
-    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
-    // function
-    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
-    return;
-  }
-  case ISD::STORE: {
-    SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
-    if (Lowered.getNode())
-      Results.push_back(Lowered);
-    return;
-  }
-  default:
-    return;
-  }
-}
-
-// FIXME: This implements accesses to initialized globals in the constant
-// address space by copying them to private and accessing that. It does not
-// properly handle illegal types or vectors. The private vector loads are not
-// scalarized, and the illegal scalars hit an assertion. This technique will not
-// work well with large initializers, and this should eventually be
-// removed. Initialized globals should be placed into a data section that the
-// runtime will load into a buffer before the kernel is executed. Uses of the
-// global need to be replaced with a pointer loaded from an implicit kernel
-// argument into this buffer holding the copy of the data, which will remove the
-// need for any of this.
-SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
-                                                       const GlobalValue *GV,
-                                                       const SDValue &InitPtr,
-                                                       SDValue Chain,
-                                                       SelectionDAG &DAG) const {
-  const DataLayout *TD = getDataLayout();
-  SDLoc DL(InitPtr);
-  Type *InitTy = Init->getType();
-
-  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
-    EVT VT = EVT::getEVT(InitTy);
-    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
-    return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
-                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
-                        TD->getPrefTypeAlignment(InitTy));
-  }
-
-  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
-    EVT VT = EVT::getEVT(CFP->getType());
-    PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
-    return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
-                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
-                        TD->getPrefTypeAlignment(CFP->getType()));
-  }
-
-  if (StructType *ST = dyn_cast<StructType>(InitTy)) {
-    const StructLayout *SL = TD->getStructLayout(ST);
-
-    EVT PtrVT = InitPtr.getValueType();
-    SmallVector<SDValue, 8> Chains;
-
-    for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
-      SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT);
-      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
-
-      Constant *Elt = Init->getAggregateElement(I);
-      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
-    }
-
-    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
-  }
-
-  if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
-    EVT PtrVT = InitPtr.getValueType();
-
-    unsigned NumElements;
-    if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
-      NumElements = AT->getNumElements();
-    else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
-      NumElements = VT->getNumElements();
-    else
-      llvm_unreachable("Unexpected type");
-
-    unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
-    SmallVector<SDValue, 8> Chains;
-    for (unsigned i = 0; i < NumElements; ++i) {
-      SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT);
-      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
-
-      Constant *Elt = Init->getAggregateElement(i);
-      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
-    }
-
-    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
-  }
-
-  if (isa<UndefValue>(Init)) {
-    EVT VT = EVT::getEVT(InitTy);
-    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
-    return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
-                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
-                        TD->getPrefTypeAlignment(InitTy));
-  }
-
-  Init->dump();
-  llvm_unreachable("Unhandled constant initializer");
-}
-
-static bool hasDefinedInitializer(const GlobalValue *GV) {
-  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
-  if (!GVar || !GVar->hasInitializer())
-    return false;
-
-  if (isa<UndefValue>(GVar->getInitializer()))
-    return false;
-
-  return true;
-}
-
-SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
-                                                 SDValue Op,
-                                                 SelectionDAG &DAG) const {
-
-  const DataLayout *TD = getDataLayout();
-  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
-  const GlobalValue *GV = G->getGlobal();
-
-  switch (G->getAddressSpace()) {
-  case AMDGPUAS::LOCAL_ADDRESS: {
-    // XXX: What does the value of G->getOffset() mean?
-    assert(G->getOffset() == 0 &&
-           "Do not know what to do with a non-zero offset");
-
-    // TODO: We could emit code to handle the initialization somewhere.
-    if (hasDefinedInitializer(GV))
-      break;
-
-    unsigned Offset;
-    if (MFI->LocalMemoryObjects.count(GV) == 0) {
-      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
-      Offset = MFI->LDSSize;
-      MFI->LocalMemoryObjects[GV] = Offset;
-      // XXX: Account for alignment?
-      MFI->LDSSize += Size;
-    } else {
-      Offset = MFI->LocalMemoryObjects[GV];
-    }
-
-    return DAG.getConstant(Offset, SDLoc(Op),
-                           getPointerTy(AMDGPUAS::LOCAL_ADDRESS));
-  }
-  case AMDGPUAS::CONSTANT_ADDRESS: {
-    MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
-    Type *EltType = GV->getType()->getElementType();
-    unsigned Size = TD->getTypeAllocSize(EltType);
-    unsigned Alignment = TD->getPrefTypeAlignment(EltType);
-
-    MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
-    MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
-
-    int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
-    SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
-
-    const GlobalVariable *Var = cast<GlobalVariable>(GV);
-    if (!Var->hasInitializer()) {
-      // This has no use, but bugpoint will hit it.
-      return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
-    }
-
-    const Constant *Init = Var->getInitializer();
-    SmallVector<SDNode*, 8> WorkList;
-
-    for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
-                              E = DAG.getEntryNode()->use_end(); I != E; ++I) {
-      if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
-        continue;
-      WorkList.push_back(*I);
-    }
-    SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
-    for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
-         E = WorkList.end(); I != E; ++I) {
-      SmallVector<SDValue, 8> Ops;
-      Ops.push_back(Chain);
-      for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
-        Ops.push_back((*I)->getOperand(i));
-      }
-      DAG.UpdateNodeOperands(*I, Ops);
-    }
-    return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
-  }
-  }
-
-  const Function &Fn = *DAG.getMachineFunction().getFunction();
-  DiagnosticInfoUnsupported BadInit(Fn,
-                                    "initializer for address space");
-  DAG.getContext()->diagnose(BadInit);
-  return SDValue();
-}
-
-SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
-                                                  SelectionDAG &DAG) const {
-  SmallVector<SDValue, 8> Args;
-
-  for (const SDUse &U : Op->ops())
-    DAG.ExtractVectorElements(U.get(), Args);
-
-  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
-}
-
-SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
-                                                     SelectionDAG &DAG) const {
-
-  SmallVector<SDValue, 8> Args;
-  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-  EVT VT = Op.getValueType();
-  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
-                            VT.getVectorNumElements());
-
-  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
-}
-
-SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
-                                              SelectionDAG &DAG) const {
-
-  MachineFunction &MF = DAG.getMachineFunction();
-  const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering();
-
-  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
-
-  unsigned FrameIndex = FIN->getIndex();
-  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
-  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
-                         Op.getValueType());
-}
-
-SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
-                                                      SelectionDAG &DAG) const {
-  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-
-  switch (IntrinsicID) {
-    default: return Op;
-    case AMDGPUIntrinsic::AMDGPU_abs:
-    case AMDGPUIntrinsic::AMDIL_abs: // Legacy name.
-      return LowerIntrinsicIABS(Op, DAG);
-    case AMDGPUIntrinsic::AMDGPU_lrp:
-      return LowerIntrinsicLRP(Op, DAG);
-
-    case AMDGPUIntrinsic::AMDGPU_clamp:
-    case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
-      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
-                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
-    case Intrinsic::AMDGPU_div_scale: {
-      // 3rd parameter required to be a constant.
-      const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
-      if (!Param)
-        return DAG.getUNDEF(VT);
-
-      // Translate to the operands expected by the machine instruction. The
-      // first parameter must be the same as the first instruction.
-      SDValue Numerator = Op.getOperand(1);
-      SDValue Denominator = Op.getOperand(2);
-
-      // Note this order is opposite of the machine instruction's operations,
-      // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
-      // intrinsic has the numerator as the first operand to match a normal
-      // division operation.
-
-      SDValue Src0 = Param->isAllOnesValue() ?
Numerator : Denominator; - - return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, - Denominator, Numerator); - } - - case Intrinsic::AMDGPU_div_fmas: - return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(4)); - - case Intrinsic::AMDGPU_div_fixup: - return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case Intrinsic::AMDGPU_trig_preop: - return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::AMDGPU_rcp: - return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_rsq: - return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_legacy_rsq: - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_rsq_clamped: - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - Type *Type = VT.getTypeForEVT(*DAG.getContext()); - APFloat Max = APFloat::getLargest(Type->getFltSemantics()); - APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); - - SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, - DAG.getConstantFP(Max, DL, VT)); - return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, - DAG.getConstantFP(Min, DL, VT)); - } else { - return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); - } - - case Intrinsic::AMDGPU_ldexp: - return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_imax: - return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_umax: - return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_imin: - return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_umin: - return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_umul24: - return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_imul24: - return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_umad24: - return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_imad24: - return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_bfe_i32: - return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfe_u32: - return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfi: - return DAG.getNode(AMDGPUISD::BFI, DL, VT, - Op.getOperand(1), - 
Op.getOperand(2),
-                         Op.getOperand(3));
-
-    case AMDGPUIntrinsic::AMDGPU_bfm:
-      return DAG.getNode(AMDGPUISD::BFM, DL, VT,
-                         Op.getOperand(1),
-                         Op.getOperand(2));
-
-    case AMDGPUIntrinsic::AMDGPU_brev:
-      return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));
-
-    case Intrinsic::AMDGPU_class:
-      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
-                         Op.getOperand(1), Op.getOperand(2));
-
-    case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
-      return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
-
-    case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
-      return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
-    case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
-      return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
-  }
-}
-
-/// IABS(a) = SMAX(sub(0, a), a)
-SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
-                                                 SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
-                            Op.getOperand(1));
-
-  return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1));
-}
-
-/// Linear Interpolation
-/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
-SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
-                                                SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-  SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
-                                DAG.getConstantFP(1.0f, DL, MVT::f32),
-                                Op.getOperand(1));
-  SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
-                                 Op.getOperand(3));
-  return DAG.getNode(ISD::FADD, DL, VT,
-                     DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
-                     OneSubAC);
-}
-
-/// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
-                                                   EVT VT,
-                                                   SDValue LHS,
-                                                   SDValue RHS,
-                                                   SDValue True,
-                                                   SDValue False,
-                                                   SDValue CC,
-                                                   DAGCombinerInfo &DCI) const {
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-    return SDValue();
-
-  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
-    return SDValue();
-
-  SelectionDAG &DAG = DCI.DAG;
-  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
-  switch (CCOpcode) {
-  case ISD::SETOEQ:
-  case ISD::SETONE:
-  case ISD::SETUNE:
-  case ISD::SETNE:
-  case ISD::SETUEQ:
-  case ISD::SETEQ:
-  case ISD::SETFALSE:
-  case ISD::SETFALSE2:
-  case ISD::SETTRUE:
-  case ISD::SETTRUE2:
-  case ISD::SETUO:
-  case ISD::SETO:
-    break;
-  case ISD::SETULE:
-  case ISD::SETULT: {
-    if (LHS == True)
-      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
-    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
-  }
-  case ISD::SETOLE:
-  case ISD::SETOLT:
-  case ISD::SETLE:
-  case ISD::SETLT: {
-    // Ordered. Assume ordered for undefined.
-
-    // Only do this after legalization to avoid interfering with other combines
-    // which might occur.
-    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
-        !DCI.isCalledByLegalizer())
-      return SDValue();
-
-    // We need to permute the operands to get the correct NaN behavior. The
-    // selected operand is the second one based on the failing compare with NaN,
-    // so permute it based on the compare type the hardware uses.
-    if (LHS == True)
-      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
-    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
-  }
-  case ISD::SETUGE:
-  case ISD::SETUGT: {
-    if (LHS == True)
-      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
-    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
-  }
-  case ISD::SETGT:
-  case ISD::SETGE:
-  case ISD::SETOGE:
-  case ISD::SETOGT: {
-    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
-        !DCI.isCalledByLegalizer())
-      return SDValue();
-
-    if (LHS == True)
-      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
-    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
-  }
-  case ISD::SETCC_INVALID:
-    llvm_unreachable("Invalid setcc condcode!");
-  }
-  return SDValue();
-}
-
-// FIXME: Remove this when combines added to DAGCombiner.
-SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL,
-                                             EVT VT,
-                                             SDValue LHS,
-                                             SDValue RHS,
-                                             SDValue True,
-                                             SDValue False,
-                                             SDValue CC,
-                                             SelectionDAG &DAG) const {
-  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
-    return SDValue();
-
-  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
-  switch (CCOpcode) {
-  case ISD::SETULE:
-  case ISD::SETULT: {
-    unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX;
-    return DAG.getNode(Opc, DL, VT, LHS, RHS);
-  }
-  case ISD::SETLE:
-  case ISD::SETLT: {
-    unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX;
-    return DAG.getNode(Opc, DL, VT, LHS, RHS);
-  }
-  case ISD::SETGT:
-  case ISD::SETGE: {
-    unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN;
-    return DAG.getNode(Opc, DL, VT, LHS, RHS);
-  }
-  case ISD::SETUGE:
-  case ISD::SETUGT: {
-    unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN;
-    return DAG.getNode(Opc, DL, VT, LHS, RHS);
-  }
-  default:
-    return SDValue();
-  }
-}
-
-SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
-                                                  SelectionDAG &DAG) const {
-  LoadSDNode *Load = cast<LoadSDNode>(Op);
-  EVT MemVT = Load->getMemoryVT();
-  EVT MemEltVT = MemVT.getVectorElementType();
-
-  EVT LoadVT = Op.getValueType();
-  EVT EltVT = LoadVT.getVectorElementType();
-  EVT PtrVT = Load->getBasePtr().getValueType();
-
-  unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
-  SmallVector<SDValue, 8> Loads;
-  SmallVector<SDValue, 8> Chains;
-
-  SDLoc SL(Op);
-  unsigned MemEltSize = MemEltVT.getStoreSize();
-  MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
-
-  for (unsigned i = 0; i < NumElts; ++i) {
-    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
-                              DAG.getConstant(i * MemEltSize, SL, PtrVT));
-
-    SDValue NewLoad
-      = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
-                       Load->getChain(), Ptr,
-                       SrcValue.getWithOffset(i * MemEltSize),
-                       MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
-                       Load->isInvariant(), Load->getAlignment());
-    Loads.push_back(NewLoad.getValue(0));
-    Chains.push_back(NewLoad.getValue(1));
-  }
-
-  SDValue Ops[] = {
-    DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
-    DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
-  };
-
-  return DAG.getMergeValues(Ops, SL);
-}
-
-SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
-                                              SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-
-  // If this is a 2 element vector, we really want to scalarize and not create
-  // weird 1 element vectors.
-  if (VT.getVectorNumElements() == 2)
-    return ScalarizeVectorLoad(Op, DAG);
-
-  LoadSDNode *Load = cast<LoadSDNode>(Op);
-  SDValue BasePtr = Load->getBasePtr();
-  EVT PtrVT = BasePtr.getValueType();
-  EVT MemVT = Load->getMemoryVT();
-  SDLoc SL(Op);
-  MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
-
-  EVT LoVT, HiVT;
-  EVT LoMemVT, HiMemVT;
-  SDValue Lo, Hi;
-
-  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
-  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
-  std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
-  SDValue LoLoad
-    = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
-                     Load->getChain(), BasePtr,
-                     SrcValue,
-                     LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
-                     Load->isInvariant(), Load->getAlignment());
-
-  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
-                              DAG.getConstant(LoMemVT.getStoreSize(), SL,
-                                              PtrVT));
-
-  SDValue HiLoad
-    = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
-                     Load->getChain(), HiPtr,
-                     SrcValue.getWithOffset(LoMemVT.getStoreSize()),
-                     HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
-                     Load->isInvariant(), Load->getAlignment());
-
-  SDValue Ops[] = {
-    DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
-    DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
-                LoLoad.getValue(1), HiLoad.getValue(1))
-  };
-
-  return DAG.getMergeValues(Ops, SL);
-}
-
-SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
-                                               SelectionDAG &DAG) const {
-  StoreSDNode *Store = cast<StoreSDNode>(Op);
-  EVT MemVT = Store->getMemoryVT();
-  unsigned MemBits = MemVT.getSizeInBits();
-
-  // Byte stores are really expensive, so if possible, try to pack 32-bit vector
-  // truncating store into an i32 store.
-  // XXX: We could also optimize other vector bitwidths.
-  if (!MemVT.isVector() || MemBits > 32) {
-    return SDValue();
-  }
-
-  SDLoc DL(Op);
-  SDValue Value = Store->getValue();
-  EVT VT = Value.getValueType();
-  EVT ElemVT = VT.getVectorElementType();
-  SDValue Ptr = Store->getBasePtr();
-  EVT MemEltVT = MemVT.getVectorElementType();
-  unsigned MemEltBits = MemEltVT.getSizeInBits();
-  unsigned MemNumElements = MemVT.getVectorNumElements();
-  unsigned PackedSize = MemVT.getStoreSizeInBits();
-  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32);
-
-  assert(Value.getValueType().getScalarSizeInBits() >= 32);
-
-  SDValue PackedValue;
-  for (unsigned i = 0; i < MemNumElements; ++i) {
-    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
-                              DAG.getConstant(i, DL, MVT::i32));
-    Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
-    Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg
-
-    SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32);
-    Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);
-
-    if (i == 0) {
-      PackedValue = Elt;
-    } else {
-      PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
-    }
-  }
-
-  if (PackedSize < 32) {
-    EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
-    return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
-                             Store->getMemOperand()->getPointerInfo(),
-                             PackedVT,
-                             Store->isNonTemporal(), Store->isVolatile(),
-                             Store->getAlignment());
-  }
-
-  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
-                      Store->getMemOperand()->getPointerInfo(),
-                      Store->isVolatile(), Store->isNonTemporal(),
-                      Store->getAlignment());
-}
-
-SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op,
-                                                   SelectionDAG &DAG) const {
-  StoreSDNode *Store = cast<StoreSDNode>(Op);
-  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
-  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
-  EVT PtrVT = Store->getBasePtr().getValueType();
-  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
-  SDLoc SL(Op);
-
-  SmallVector<SDValue, 8> Chains;
-
-  unsigned EltSize = MemEltVT.getStoreSize();
-  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
-
-  for (unsigned i = 0, e = NumElts; i != e; ++i) {
-    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
-                              Store->getValue(),
-                              DAG.getConstant(i, SL, MVT::i32));
-
-    SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT);
-    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset);
-    SDValue NewStore =
-      DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
-                        SrcValue.getWithOffset(i * EltSize),
-                        MemEltVT, Store->isNonTemporal(), Store->isVolatile(),
-                        Store->getAlignment());
-    Chains.push_back(NewStore);
-  }
-
-  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
-}
-
-SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  StoreSDNode *Store = cast<StoreSDNode>(Op);
-  SDValue Val = Store->getValue();
-  EVT VT = Val.getValueType();
-
-  // If this is a 2 element vector, we really want to scalarize and not create
-  // weird 1 element vectors.
-  if (VT.getVectorNumElements() == 2)
-    return ScalarizeVectorStore(Op, DAG);
-
-  EVT MemVT = Store->getMemoryVT();
-  SDValue Chain = Store->getChain();
-  SDValue BasePtr = Store->getBasePtr();
-  SDLoc SL(Op);
-
-  EVT LoVT, HiVT;
-  EVT LoMemVT, HiMemVT;
-  SDValue Lo, Hi;
-
-  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
-  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
-  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
-
-  EVT PtrVT = BasePtr.getValueType();
-  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
-                              DAG.getConstant(LoMemVT.getStoreSize(), SL,
-                                              PtrVT));
-
-  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
-  SDValue LoStore
-    = DAG.getTruncStore(Chain, SL, Lo,
-                        BasePtr,
-                        SrcValue,
-                        LoMemVT,
-                        Store->isNonTemporal(),
-                        Store->isVolatile(),
-                        Store->getAlignment());
-  SDValue HiStore
-    = DAG.getTruncStore(Chain, SL, Hi,
-                        HiPtr,
-                        SrcValue.getWithOffset(LoMemVT.getStoreSize()),
-                        HiMemVT,
-                        Store->isNonTemporal(),
-                        Store->isVolatile(),
-                        Store->getAlignment());
-
-  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
-}
-
-
-SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  LoadSDNode *Load = cast<LoadSDNode>(Op);
-  ISD::LoadExtType ExtType = Load->getExtensionType();
-  EVT VT = Op.getValueType();
-  EVT MemVT = Load->getMemoryVT();
-
-  if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
-    assert(VT == MVT::i1 && "Only i1 non-extloads expected");
-    // FIXME: Copied from PPC
-    // First, load into 32 bits, then truncate to 1 bit.
-
-    SDValue Chain = Load->getChain();
-    SDValue BasePtr = Load->getBasePtr();
-    MachineMemOperand *MMO = Load->getMemOperand();
-
-    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
-                                   BasePtr, MVT::i8, MMO);
-
-    SDValue Ops[] = {
-      DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
-      NewLD.getValue(1)
-    };
-
-    return DAG.getMergeValues(Ops, DL);
-  }
-
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS ||
-      Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
-      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
-    return SDValue();
-
-  // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
-  // register (2-)byte extract.
-
-  // Get Register holding the target.
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
-                            DAG.getConstant(2, DL, MVT::i32));
-  // Load the Register.
-  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
-                            Load->getChain(), Ptr,
-                            DAG.getTargetConstant(0, DL, MVT::i32),
-                            Op.getOperand(2));
-
-  // Get offset within the register.
-  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
-                                Load->getBasePtr(),
-                                DAG.getConstant(0x3, DL, MVT::i32));
-
-  // Bit offset of target byte (byteIdx * 8).
-  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
-                                 DAG.getConstant(3, DL, MVT::i32));
-
-  // Shift to the right.
-  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
-
-  // Eliminate the upper bits by setting them to ...
-  EVT MemEltVT = MemVT.getScalarType();
-
-  // ... ones.
-  if (ExtType == ISD::SEXTLOAD) {
-    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
-
-    SDValue Ops[] = {
-      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
-      Load->getChain()
-    };
-
-    return DAG.getMergeValues(Ops, DL);
-  }
-
-  // ... or zeros.
-  SDValue Ops[] = {
-    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
-    Load->getChain()
-  };
-
-  return DAG.getMergeValues(Ops, DL);
-}
-
-SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
-  if (Result.getNode()) {
-    return Result;
-  }
-
-  StoreSDNode *Store = cast<StoreSDNode>(Op);
-  SDValue Chain = Store->getChain();
-  if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
-       Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
-      Store->getValue().getValueType().isVector()) {
-    return ScalarizeVectorStore(Op, DAG);
-  }
-
-  EVT MemVT = Store->getMemoryVT();
-  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
-      MemVT.bitsLT(MVT::i32)) {
-    unsigned Mask = 0;
-    if (Store->getMemoryVT() == MVT::i8) {
-      Mask = 0xff;
-    } else if (Store->getMemoryVT() == MVT::i16) {
-      Mask = 0xffff;
-    }
-    SDValue BasePtr = Store->getBasePtr();
-    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
-                              DAG.getConstant(2, DL, MVT::i32));
-    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
-                              Chain, Ptr,
-                              DAG.getTargetConstant(0, DL, MVT::i32));
-
-    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
-                                  DAG.getConstant(0x3, DL, MVT::i32));
-
-    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
-                                   DAG.getConstant(3, DL, MVT::i32));
-
-    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
-                                    Store->getValue());
-
-    SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
-
-    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
-                                       MaskedValue, ShiftAmt);
-
-    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
-                                  DAG.getConstant(Mask, DL, MVT::i32),
-                                  ShiftAmt);
-    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
-                          DAG.getConstant(0xffffffff, DL, MVT::i32));
-    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
-
-    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
-    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
-                       Chain, Value, Ptr,
-                       DAG.getTargetConstant(0, DL, MVT::i32));
-  }
-  return SDValue();
-}
-
-// This is a shortcut for integer division because we have fast i32<->f32
-// conversions, and fast f32 reciprocal instructions. The fractional part of a
-// float is enough to accurately represent up to a 24-bit integer.
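A scalar model of the 24-bit shortcut described in the comment above may help; the removed LowerDIVREM24 that follows builds the same dataflow out of SelectionDAG nodes. This sketch covers only the unsigned case and assumes both operands fit in 24 bits (function names are hypothetical, and it uses 1.0f/fb where the GPU uses its RCP instruction):

#include <cstdint>
#include <cmath>
#include <cassert>

static void udivrem24(uint32_t Num, uint32_t Den,
                      uint32_t &Div, uint32_t &Rem) {
  float fa = static_cast<float>(Num);
  float fb = static_cast<float>(Den);
  // fq = native_divide(fa, fb), truncated toward zero.
  float fq = std::truncf(fa * (1.0f / fb));
  // fr = |fa - fq * fb|: the residue decides whether fq is one short.
  float fr = std::fabs(fa - fq * fb);
  uint32_t iq = static_cast<uint32_t>(fq);
  // If the residue still covers a whole divisor, bump the quotient by one.
  Div = iq + ((fr >= fb) ? 1u : 0u);
  // The remainder is cheaper to recompute than to carry through.
  Rem = Num - Div * Den;
}

int main() {
  uint32_t D, R;
  udivrem24(1000000u, 7u, D, R);
  assert(D == 142857u && R == 1u);
  udivrem24((1u << 24) - 1u, 3u, D, R);
  assert(D == 5592405u && R == 0u);
  return 0;
}

The intent of the correction step, as the comments in the removed code spell out, is that for in-range inputs the truncated float quotient can fall at most one below the exact quotient, and comparing |fr| against fb decides whether to bump it.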
-SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const {
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  MVT IntVT = MVT::i32;
-  MVT FltVT = MVT::f32;
-
-  ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
-  ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
-
-  if (VT.isVector()) {
-    unsigned NElts = VT.getVectorNumElements();
-    IntVT = MVT::getVectorVT(MVT::i32, NElts);
-    FltVT = MVT::getVectorVT(MVT::f32, NElts);
-  }
-
-  unsigned BitSize = VT.getScalarType().getSizeInBits();
-
-  SDValue jq = DAG.getConstant(1, DL, IntVT);
-
-  if (sign) {
-    // char|short jq = ia ^ ib;
-    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
-
-    // jq = jq >> (bitsize - 2)
-    jq = DAG.getNode(ISD::SRA, DL, VT, jq,
-                     DAG.getConstant(BitSize - 2, DL, VT));
-
-    // jq = jq | 0x1
-    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
-
-    // jq = (int)jq
-    jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
-  }
-
-  // int ia = (int)LHS;
-  SDValue ia = sign ?
-    DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT);
-
-  // int ib = (int)RHS;
-  SDValue ib = sign ?
-    DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT);
-
-  // float fa = (float)ia;
-  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
-
-  // float fb = (float)ib;
-  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
-
-  // float fq = native_divide(fa, fb);
-  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
-                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
-
-  // fq = trunc(fq);
-  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
-
-  // float fqneg = -fq;
-  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
-
-  // float fr = mad(fqneg, fb, fa);
-  SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT,
-                           DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa);
-
-  // int iq = (int)fq;
-  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
-
-  // fr = fabs(fr);
-  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
-
-  // fb = fabs(fb);
-  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
-
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT);
-
-  // int cv = fr >= fb;
-  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
-
-  // jq = (cv ? jq : 0);
-  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
-
-  // dst = trunc/extend to legal type
-  iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT);
-
-  // dst = iq + jq;
-  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
-
-  // Rem needs compensation; it's easier to recompute it.
-  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
-  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
-
-  SDValue Res[2] = {
-    Div,
-    Rem
-  };
-  return DAG.getMergeValues(Res, DL);
-}
-
-void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
-                                          SelectionDAG &DAG,
-                                          SmallVectorImpl<SDValue> &Results) const {
-  assert(Op.getValueType() == MVT::i64);
-
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
-
-  SDValue one = DAG.getConstant(1, DL, HalfVT);
-  SDValue zero = DAG.getConstant(0, DL, HalfVT);
-
-  //HiLo split
-  SDValue LHS = Op.getOperand(0);
-  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
-  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
-
-  SDValue RHS = Op.getOperand(1);
-  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
-  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
-
-  if (VT == MVT::i64 &&
-      DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
-      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
-
-    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
-                              LHS_Lo, RHS_Lo);
-
-    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero);
-    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero);
-    Results.push_back(DIV);
-    Results.push_back(REM);
-    return;
-  }
-
-  // Get Speculative values
-  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
-  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
-
-  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
-  SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero);
-
-  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
-  SDValue DIV_Lo = zero;
-
-  const unsigned halfBitWidth = HalfVT.getSizeInBits();
-
-  for (unsigned i = 0; i < halfBitWidth; ++i) {
-    const unsigned bitPos = halfBitWidth - i - 1;
-    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
-    // Get value of high bit
-    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
-    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
-    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
-
-    // Shift
-    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
-    // Add LHS high bit
-    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
-
-    SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT);
-    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
-
-    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
-
-    // Update REM
-    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
-    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
-  }
-
-  SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
-  Results.push_back(DIV);
-  Results.push_back(REM);
-}
-
-SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-
-  if (VT == MVT::i64) {
-    SmallVector<SDValue, 2> Results;
-    LowerUDIVREM64(Op, DAG, Results);
-    return DAG.getMergeValues(Results, DL);
-  }
-
-  SDValue Num = Op.getOperand(0);
-  SDValue Den = Op.getOperand(1);
-
-  if (VT == MVT::i32) {
-    if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
-        DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
-      // TODO: We technically could do this for i64, but shouldn't that just be
-      // handled by something generally reducing 64-bit division on 32-bit
-      // values to 32-bit?
-      return LowerDIVREM24(Op, DAG, false);
-    }
-  }
-
-  // RCP = URECIP(Den) = 2^32 / Den + e
-  // e is rounding error.
-  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
-
-  // RCP_LO = mul(RCP, Den)
-  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
-
-  // RCP_HI = mulhu(RCP, Den)
-  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
-
-  // NEG_RCP_LO = -RCP_LO
-  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
-                                   RCP_LO);
-
-  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
-  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
-                                       NEG_RCP_LO, RCP_LO,
-                                       ISD::SETEQ);
-  // Calculate the rounding error from the URECIP instruction
-  // E = mulhu(ABS_RCP_LO, RCP)
-  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
-
-  // RCP_A_E = RCP + E
-  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
-
-  // RCP_S_E = RCP - E
-  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
-
-  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
-  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
-                                 RCP_A_E, RCP_S_E,
-                                 ISD::SETEQ);
-  // Quotient = mulhu(Tmp0, Num)
-  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
-
-  // Num_S_Remainder = Quotient * Den
-  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
-
-  // Remainder = Num - Num_S_Remainder
-  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
-
-  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
-  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
-                                             DAG.getConstant(-1, DL, VT),
-                                             DAG.getConstant(0, DL, VT),
-                                             ISD::SETUGE);
-  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
-  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
-                                              Num_S_Remainder,
-                                              DAG.getConstant(-1, DL, VT),
-                                              DAG.getConstant(0, DL, VT),
-                                              ISD::SETUGE);
-  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
-  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
-                             Remainder_GE_Zero);
-
-  // Calculate Division result:
-
-  // Quotient_A_One = Quotient + 1
-  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
-                                       DAG.getConstant(1, DL, VT));
-
-  // Quotient_S_One = Quotient - 1
-  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
-                                       DAG.getConstant(1, DL, VT));
-
-  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
-  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
-                                Quotient, Quotient_A_One, ISD::SETEQ);
-
-  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
-  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
-                        Quotient_S_One, Div, ISD::SETEQ);
-
-  // Calculate Rem result:
-
-  // Remainder_S_Den = Remainder - Den
-  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
-
-  // Remainder_A_Den = Remainder + Den
-  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
-
-  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
-  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
-                                Remainder, Remainder_S_Den, ISD::SETEQ);
-
-  // Rem = (Remainder_GE_Zero == 0 ?
Remainder_A_Den : Rem) - Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), - Remainder_A_Den, Rem, ISD::SETEQ); - SDValue Ops[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Ops, DL); -} - -SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue NegOne = DAG.getConstant(-1, DL, VT); - - if (VT == MVT::i32 && - DAG.ComputeNumSignBits(LHS) > 8 && - DAG.ComputeNumSignBits(RHS) > 8) { - return LowerDIVREM24(Op, DAG, true); - } - if (VT == MVT::i64 && - DAG.ComputeNumSignBits(LHS) > 32 && - DAG.ComputeNumSignBits(RHS) > 32) { - EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - - //HiLo split - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); - SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), - LHS_Lo, RHS_Lo); - SDValue Res[2] = { - DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), - DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) - }; - return DAG.getMergeValues(Res, DL); - } - - SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); - SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); - SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); - SDValue RSign = LHSign; // Remainder sign is the same as LHS - - LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); - RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); - - LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); - RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); - - SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); - SDValue Rem = Div.getValue(1); - - Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); - Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); - - Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); - Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); - - SDValue Res[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Res, DL); -} - -// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) -SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - EVT VT = Op.getValueType(); - SDValue X = Op.getOperand(0); - SDValue Y = Op.getOperand(1); - - SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); - SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); - - return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); -} - -SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - // result = trunc(src) - // if (src > 0.0 && src != result) - // result += 1.0 - - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - - const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); - - SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); - SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); - SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); - - SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); - return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); -} - -static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { - const 
unsigned FractBits = 52;
-  const unsigned ExpBits = 11;
-
-  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
-                                Hi,
-                                DAG.getConstant(FractBits - 32, SL, MVT::i32),
-                                DAG.getConstant(ExpBits, SL, MVT::i32));
-  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
-                            DAG.getConstant(1023, SL, MVT::i32));
-
-  return Exp;
-}
-
-SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc SL(Op);
-  SDValue Src = Op.getOperand(0);
-
-  assert(Op.getValueType() == MVT::f64);
-
-  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
-  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
-
-  SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
-
-  // Extract the upper half, since this is where we will find the sign and
-  // exponent.
-  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
-
-  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
-
-  const unsigned FractBits = 52;
-
-  // Extract the sign bit.
-  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
-  SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
-
-  // Extend back to 64-bits.
-  SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
-                                  Zero, SignBit);
-  SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
-
-  SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
-  const SDValue FractMask
-    = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
-
-  SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
-  SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
-  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
-
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
-
-  const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
-
-  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
-  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
-
-  SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
-  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
-
-  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
-}
-
-SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc SL(Op);
-  SDValue Src = Op.getOperand(0);
-
-  assert(Op.getValueType() == MVT::f64);
-
-  APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
-  SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
-  SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
-
-  SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
-  SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
-
-  SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
-
-  APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
-  SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
-
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
-  SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
-
-  return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
-}
-
-SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
-  // FNEARBYINT and FRINT are the same, except in their handling of FP
-  // exceptions. Those aren't really meaningful for us, and OpenCL only has
-  // rint, so just treat them as equivalent.
-  return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
-}
-
-// XXX - May require not supporting f32 denormals?
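A note on the LowerFRINT hunk above before the removed LowerFROUND32 continues below: it rounds an f64 to an integral value by adding and then subtracting copysign(2^52, x), because at magnitude 2^52 the spacing between adjacent doubles is exactly 1.0, so the FPU's rounding of the addition does the work; inputs already at or beyond 0x1.fffffffffffffp+51 in magnitude are integral and pass through. A standalone sketch (illustrative only; assumes the default round-to-nearest mode and no -ffast-math):

#include <cmath>
#include <cassert>

static double rintViaMagic(double Src) {
  const double C1 = 4503599627370496.0;    // 0x1.0p+52
  const double C2 = 4503599627370495.5;    // 0x1.fffffffffffffp+51
  double CopySign = std::copysign(C1, Src);
  // The add forces rounding to an integer-spaced value; the subtract
  // recovers the rounded input.
  double Tmp = (Src + CopySign) - CopySign;
  return std::fabs(Src) > C2 ? Src : Tmp;
}

int main() {
  assert(rintViaMagic(2.5) == 2.0);    // ties round to even
  assert(rintViaMagic(3.5) == 4.0);
  assert(rintViaMagic(-1.2) == -1.0);
  assert(rintViaMagic(9007199254740992.0) == 9007199254740992.0);  // 2^53
  return 0;
}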
-SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue X = Op.getOperand(0); - - SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); - - SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); - - SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); - - const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32); - - SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); - - SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); - - SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); - - return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); -} - -SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue X = Op.getOperand(0); - - SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); - - const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - const SDValue One = DAG.getConstant(1, SL, MVT::i32); - const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32); - const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); - - - SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); - - SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); - - SDValue Exp = extractF64Exponent(Hi, SL, DAG); - - const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL, - MVT::i64); - - SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); - SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, - DAG.getConstant(INT64_C(0x0008000000000000), SL, - MVT::i64), - Exp); - - SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); - SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, - DAG.getConstant(0, SL, MVT::i64), Tmp0, - ISD::SETNE); - - SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, - D, DAG.getConstant(0, SL, MVT::i64)); - SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); - - K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); - K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); - - SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); - SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); - SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); - - SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, - ExpEqNegOne, - DAG.getConstantFP(1.0, SL, MVT::f64), - DAG.getConstantFP(0.0, SL, MVT::f64)); - - SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); - - K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); - K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); - - return K; -} - -SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - - if (VT == MVT::f32) - return LowerFROUND32(Op, DAG); - - if (VT == MVT::f64) - return LowerFROUND64(Op, DAG); - - llvm_unreachable("unhandled type"); -} - -SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - // result = trunc(src); - // if (src < 0.0 && src != result) - // result += -1.0. 
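// For example (illustrative): src = -2.25 gives trunc(src) = -2.0; since
// src < 0.0 and src != result, -1.0 is added, yielding floor(-2.25) = -3.0.
// For src = 2.25 the condition fails and trunc already equals the floor.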
- - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - - const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); - const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); - - SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); - SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); - SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); - - SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); - return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); -} - -SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, - bool Signed) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); - - SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, - DAG.getConstant(0, SL, MVT::i32)); - SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, - DAG.getConstant(1, SL, MVT::i32)); - - SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, - SL, MVT::f64, Hi); - - SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); - - SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, - DAG.getConstant(32, SL, MVT::i32)); - - return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); -} - -SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, - SelectionDAG &DAG) const { - SDValue S0 = Op.getOperand(0); - if (S0.getValueType() != MVT::i64) - return SDValue(); - - EVT DestVT = Op.getValueType(); - if (DestVT == MVT::f64) - return LowerINT_TO_FP64(Op, DAG, false); - - assert(DestVT == MVT::f32); - - SDLoc DL(Op); - - // f32 uint_to_fp i64 - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(0, DL, MVT::i32)); - SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(1, DL, MVT::i32)); - SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); - FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, - DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 - return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); -} - -SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, - SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) - return LowerINT_TO_FP64(Op, DAG, true); - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, - bool Signed) const { - SDLoc SL(Op); - - SDValue Src = Op.getOperand(0); - - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - - SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, - MVT::f64); - SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, - MVT::f64); - - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); - - SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); - - - SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); - - SDValue Hi = DAG.getNode(Signed ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, - MVT::i32, FloorMul); - SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); - - SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); - - return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); -} - -SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, - SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - - if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) - return LowerFP64_TO_INT(Op, DAG, true); - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, - SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - - if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) - return LowerFP64_TO_INT(Op, DAG, false); - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, - SelectionDAG &DAG) const { - EVT ExtraVT = cast(Op.getOperand(1))->getVT(); - MVT VT = Op.getSimpleValueType(); - MVT ScalarVT = VT.getScalarType(); - - if (!VT.isVector()) - return SDValue(); - - SDValue Src = Op.getOperand(0); - SDLoc DL(Op); - - // TODO: Don't scalarize on Evergreen? - unsigned NElts = VT.getVectorNumElements(); - SmallVector Args; - DAG.ExtractVectorElements(Src, Args, 0, NElts); - - SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); - for (unsigned I = 0; I < NElts; ++I) - Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); - - return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); -} - -//===----------------------------------------------------------------------===// -// Custom DAG optimizations -//===----------------------------------------------------------------------===// - -static bool isU24(SDValue Op, SelectionDAG &DAG) { - APInt KnownZero, KnownOne; - EVT VT = Op.getValueType(); - DAG.computeKnownBits(Op, KnownZero, KnownOne); - - return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; -} - -static bool isI24(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - - // In order for this to be a signed 24-bit value, bit 23, must - // be a sign bit. - return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated - // as unsigned 24-bit values. - (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; -} - -static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { - - SelectionDAG &DAG = DCI.DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT VT = Op.getValueType(); - - APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, true, true); - if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) - DCI.CommitTargetLoweringOpt(TLO); -} - -template -static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, - uint32_t Offset, uint32_t Width, SDLoc DL) { - if (Width + Offset < 32) { - uint32_t Shl = static_cast(Src0) << (32 - Offset - Width); - IntTy Result = static_cast(Shl) >> (32 - Width); - return DAG.getConstant(Result, DL, MVT::i32); - } - - return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); -} - -static bool usesAllNormalStores(SDNode *LoadVal) { - for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { - if (!ISD::isNormalStore(*I)) - return false; - } - - return true; -} - -// If we have a copy of an illegal type, replace it with a load / store of an -// equivalently sized legal type. This avoids intermediate bit pack / unpack -// instructions emitted when handling extloads and truncstores. 
Ideally we could -// recognize the pack / unpack pattern to eliminate it. -SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - if (!DCI.isBeforeLegalize()) - return SDValue(); - - StoreSDNode *SN = cast(N); - SDValue Value = SN->getValue(); - EVT VT = Value.getValueType(); - - if (isTypeLegal(VT) || SN->isVolatile() || - !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) - return SDValue(); - - LoadSDNode *LoadVal = cast(Value); - if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) - return SDValue(); - - EVT MemVT = LoadVal->getMemoryVT(); - - SDLoc SL(N); - SelectionDAG &DAG = DCI.DAG; - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); - - SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, - LoadVT, SL, - LoadVal->getChain(), - LoadVal->getBasePtr(), - LoadVal->getOffset(), - LoadVT, - LoadVal->getMemOperand()); - - SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); - DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); - - return DAG.getStore(SN->getChain(), SL, NewLoad, - SN->getBasePtr(), SN->getMemOperand()); -} - -SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - EVT VT = N->getValueType(0); - - if (VT.isVector() || VT.getSizeInBits() > 32) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDValue Mul; - - if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { - N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); - } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { - N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); - } else { - return SDValue(); - } - - // We need to use sext even for MUL_U24, because MUL_U24 is used - // for signed multiply of 8 and 16-bit types. - return DAG.getSExtOrTrunc(Mul, DL, VT); -} - -SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - switch(N->getOpcode()) { - default: break; - case ISD::MUL: - return performMulCombine(N, DCI); - case AMDGPUISD::MUL_I24: - case AMDGPUISD::MUL_U24: { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - simplifyI24(N0, DCI); - simplifyI24(N1, DCI); - return SDValue(); - } - case ISD::SELECT: { - SDValue Cond = N->getOperand(0); - if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { - EVT VT = N->getValueType(0); - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - SDValue CC = Cond.getOperand(2); - - SDValue True = N->getOperand(1); - SDValue False = N->getOperand(2); - - if (VT == MVT::f32) - return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - - // TODO: Implement min / max Evergreen instructions. 
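// Schematically (names illustrative), the i32 path below rewrites a
// one-use compare-plus-select into a single min/max node:
//   %c = setcc setlt, %a, %b
//   %r = select %c, %a, %b      -->  %r = smin %a, %b
// so SI-class targets can select it as a single min/max instruction.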
- if (VT == MVT::i32 && - Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); - } - } - - break; - } - case AMDGPUISD::BFE_I32: - case AMDGPUISD::BFE_U32: { - assert(!N->getValueType(0).isVector() && - "Vector handling of BFE not implemented"); - ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); - if (!Width) - break; - - uint32_t WidthVal = Width->getZExtValue() & 0x1f; - if (WidthVal == 0) - return DAG.getConstant(0, DL, MVT::i32); - - ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!Offset) - break; - - SDValue BitsFrom = N->getOperand(0); - uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; - - bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; - - if (OffsetVal == 0) { - // This is already sign / zero extended, so try to fold away extra BFEs. - unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); - - unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); - if (OpSignBits >= SignBits) - return BitsFrom; - - EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); - if (Signed) { - // This is a sign_extend_inreg. Replace it to take advantage of existing - // DAG Combines. If not eliminated, we will match back to BFE during - // selection. - - // TODO: The sext_inreg of extended types ends here, although we could - // handle them in a single BFE. - return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, - DAG.getValueType(SmallVT)); - } - - return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); - } - - if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) { - if (Signed) { - return constantFoldBFE<int32_t>(DAG, - CVal->getSExtValue(), - OffsetVal, - WidthVal, - DL); - } - - return constantFoldBFE<uint32_t>(DAG, - CVal->getZExtValue(), - OffsetVal, - WidthVal, - DL); - } - - if ((OffsetVal + WidthVal) >= 32) { - SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); - return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, - BitsFrom, ShiftVal); - } - - if (BitsFrom.hasOneUse()) { - APInt Demanded = APInt::getBitsSet(32, - OffsetVal, - OffsetVal + WidthVal); - - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || - TLI.SimplifyDemandedBits(BitsFrom, Demanded, - KnownZero, KnownOne, TLO)) { - DCI.CommitTargetLoweringOpt(TLO); - } - } - - break; - } - - case ISD::STORE: - return performStoreCombine(N, DCI); - } - return SDValue(); -} - -//===----------------------------------------------------------------------===// -// Helper functions -//===----------------------------------------------------------------------===// - -void AMDGPUTargetLowering::getOriginalFunctionArgs( - SelectionDAG &DAG, - const Function *F, - const SmallVectorImpl<ISD::InputArg> &Ins, - SmallVectorImpl<ISD::InputArg> &OrigIns) const { - - for (unsigned i = 0, e = Ins.size(); i < e; ++i) { - if (Ins[i].ArgVT == Ins[i].VT) { - OrigIns.push_back(Ins[i]); - continue; - } - - EVT VT; - if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { - // Vector has been split into scalars. - VT = Ins[i].ArgVT.getVectorElementType(); - } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && - Ins[i].ArgVT.getVectorElementType() != - Ins[i].VT.getVectorElementType()) { - // Vector elements have been promoted. - VT = Ins[i].ArgVT; - } else { - // Vector has been split into smaller vectors. 
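// e.g. (illustrative) a v8i32 argument lowered as two v4i32 parts takes
// this branch, so each part is reported with its split type, v4i32.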
- VT = Ins[i].VT; - } - - ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, - Ins[i].OrigArgIndex, Ins[i].PartOffset); - OrigIns.push_back(Arg); - } -} - -bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { - if (ConstantFPSDNode * CFP = dyn_cast(Op)) { - return CFP->isExactlyValue(1.0); - } - if (ConstantSDNode *C = dyn_cast(Op)) { - return C->isAllOnesValue(); - } - return false; -} - -bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { - if (ConstantFPSDNode * CFP = dyn_cast(Op)) { - return CFP->getValueAPF().isZero(); - } - if (ConstantSDNode *C = dyn_cast(Op)) { - return C->isNullValue(); - } - return false; -} - -SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned VirtualRegister; - if (!MRI.isLiveIn(Reg)) { - VirtualRegister = MRI.createVirtualRegister(RC); - MRI.addLiveIn(Reg, VirtualRegister); - } else { - VirtualRegister = MRI.getLiveInVirtReg(Reg); - } - return DAG.getRegister(VirtualRegister, VT); -} - -#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; - -const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((AMDGPUISD::NodeType)Opcode) { - case AMDGPUISD::FIRST_NUMBER: break; - // AMDIL DAG nodes - NODE_NAME_CASE(CALL); - NODE_NAME_CASE(UMUL); - NODE_NAME_CASE(RET_FLAG); - NODE_NAME_CASE(BRANCH_COND); - - // AMDGPU DAG nodes - NODE_NAME_CASE(DWORDADDR) - NODE_NAME_CASE(FRACT) - NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(COS_HW) - NODE_NAME_CASE(SIN_HW) - NODE_NAME_CASE(FMAX_LEGACY) - NODE_NAME_CASE(FMIN_LEGACY) - NODE_NAME_CASE(FMAX3) - NODE_NAME_CASE(SMAX3) - NODE_NAME_CASE(UMAX3) - NODE_NAME_CASE(FMIN3) - NODE_NAME_CASE(SMIN3) - NODE_NAME_CASE(UMIN3) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DIV_SCALE) - NODE_NAME_CASE(DIV_FMAS) - NODE_NAME_CASE(DIV_FIXUP) - NODE_NAME_CASE(TRIG_PREOP) - NODE_NAME_CASE(RCP) - NODE_NAME_CASE(RSQ) - NODE_NAME_CASE(RSQ_LEGACY) - NODE_NAME_CASE(RSQ_CLAMPED) - NODE_NAME_CASE(LDEXP) - NODE_NAME_CASE(FP_CLASS) - NODE_NAME_CASE(DOT4) - NODE_NAME_CASE(CARRY) - NODE_NAME_CASE(BORROW) - NODE_NAME_CASE(BFE_U32) - NODE_NAME_CASE(BFE_I32) - NODE_NAME_CASE(BFI) - NODE_NAME_CASE(BFM) - NODE_NAME_CASE(BREV) - NODE_NAME_CASE(MUL_U24) - NODE_NAME_CASE(MUL_I24) - NODE_NAME_CASE(MAD_U24) - NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(TEXTURE_FETCH) - NODE_NAME_CASE(EXPORT) - NODE_NAME_CASE(CONST_ADDRESS) - NODE_NAME_CASE(REGISTER_LOAD) - NODE_NAME_CASE(REGISTER_STORE) - NODE_NAME_CASE(LOAD_CONSTANT) - NODE_NAME_CASE(LOAD_INPUT) - NODE_NAME_CASE(SAMPLE) - NODE_NAME_CASE(SAMPLEB) - NODE_NAME_CASE(SAMPLED) - NODE_NAME_CASE(SAMPLEL) - NODE_NAME_CASE(CVT_F32_UBYTE0) - NODE_NAME_CASE(CVT_F32_UBYTE1) - NODE_NAME_CASE(CVT_F32_UBYTE2) - NODE_NAME_CASE(CVT_F32_UBYTE3) - NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) - NODE_NAME_CASE(CONST_DATA_PTR) - case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; - NODE_NAME_CASE(SENDMSG) - NODE_NAME_CASE(INTERP_MOV) - NODE_NAME_CASE(INTERP_P1) - NODE_NAME_CASE(INTERP_P2) - NODE_NAME_CASE(STORE_MSKOR) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT) - case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; - } - return nullptr; -} - -SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps, - bool &UseOneConstNR) const { - SelectionDAG &DAG = DCI.DAG; - EVT VT = Operand.getValueType(); - - if (VT == MVT::f32) { - RefinementSteps = 0; - return 
DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); - } - - // TODO: There is also an f64 rsq instruction, but the documentation is less - // clear on its precision. - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const { - SelectionDAG &DAG = DCI.DAG; - EVT VT = Operand.getValueType(); - - if (VT == MVT::f32) { - // Reciprocal, < 1 ulp error. - // - // This reciprocal approximation converges to < 0.5 ulp error with one - // Newton-Raphson iteration performed with two fused multiply-adds (FMAs). - - RefinementSteps = 0; - return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); - } - - // TODO: There is also an f64 rcp instruction, but the documentation is less - // clear on its precision. - - return SDValue(); -} - -static void computeKnownBitsForMinMax(const SDValue Op0, - const SDValue Op1, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) { - APInt Op0Zero, Op0One; - APInt Op1Zero, Op1One; - DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); - DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); - - KnownZero = Op0Zero & Op1Zero; - KnownOne = Op0One & Op1One; -} - -void AMDGPUTargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const { - - KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. - - APInt KnownZero2; - APInt KnownOne2; - unsigned Opc = Op.getOpcode(); - - switch (Opc) { - default: - break; - case ISD::INTRINSIC_WO_CHAIN: { - // FIXME: The intrinsic should just use the node. - switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { - case AMDGPUIntrinsic::AMDGPU_imax: - case AMDGPUIntrinsic::AMDGPU_umax: - case AMDGPUIntrinsic::AMDGPU_imin: - case AMDGPUIntrinsic::AMDGPU_umin: - computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), - KnownZero, KnownOne, DAG, Depth); - break; - default: - break; - } - - break; - } - case AMDGPUISD::CARRY: - case AMDGPUISD::BORROW: { - KnownZero = APInt::getHighBitsSet(32, 31); - break; - } - - case AMDGPUISD::BFE_I32: - case AMDGPUISD::BFE_U32: { - ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); - if (!CWidth) - return; - - unsigned BitWidth = 32; - uint32_t Width = CWidth->getZExtValue() & 0x1f; - - if (Opc == AMDGPUISD::BFE_U32) - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); - - break; - } - } -} - -unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( - SDValue Op, - const SelectionDAG &DAG, - unsigned Depth) const { - switch (Op.getOpcode()) { - case AMDGPUISD::BFE_I32: { - ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); - if (!Width) - return 1; - - unsigned SignBits = 32 - Width->getZExtValue() + 1; - ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!Offset || !Offset->isNullValue()) - return SignBits; - - // TODO: Could probably figure something out with non-0 offsets. - unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); - return std::max(SignBits, Op0SignBits); - } - - case AMDGPUISD::BFE_U32: { - ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); - return Width ? 
32 - (Width->getZExtValue() & 0x1f) : 1; - } - - case AMDGPUISD::CARRY: - case AMDGPUISD::BORROW: - return 31; - - default: - return 1; - } -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600ClauseMergePass.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600ClauseMergePass.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600ClauseMergePass.cpp (nonexistent) @@ -1,206 +0,0 @@ -//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// The R600EmitClauseMarkers pass emits CFAlu instructions in a conservative -/// manner. This pass merges consecutive CFAlus where applicable. -/// It needs to be called after IfCvt for best results. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "r600mergeclause" - -namespace { - -static bool isCFAlu(const MachineInstr *MI) { - switch (MI->getOpcode()) { - case AMDGPU::CF_ALU: - case AMDGPU::CF_ALU_PUSH_BEFORE: - return true; - default: - return false; - } -} - -class R600ClauseMergePass : public MachineFunctionPass { - -private: - static char ID; - const R600InstrInfo *TII; - - unsigned getCFAluSize(const MachineInstr *MI) const; - bool isCFAluEnabled(const MachineInstr *MI) const; - - /// The IfCvt pass can generate "disabled" ALU clause markers that need to be - /// removed, with their contents merged into the previous ALU clause. - /// This function parses the instructions after CFAlu until it finds either a - /// disabled CFAlu, whose contents it merges, or an enabled CFAlu. - void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const; - - /// Check whether LatrCFAlu can be merged into RootCFAlu and do so if - /// possible. 
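/// Schematically (illustrative counts, not taken from a real shader), a
/// successful merge turns
///   CF_ALU COUNT = 8   (RootCFAlu)
///   CF_ALU COUNT = 4   (LatrCFAlu)
/// into a single CF_ALU with COUNT = 12, provided the combined count stays
/// below the clause limit and any KCache banks used by both clauses agree.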
- bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu) - const; - -public: - R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override; -}; - -char R600ClauseMergePass::ID = 0; - -unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const { - assert(isCFAlu(MI)); - return MI->getOperand( - TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm(); -} - -bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const { - assert(isCFAlu(MI)); - return MI->getOperand( - TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm(); -} - -void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) - const { - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); - MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end(); - I++; - do { - while (I!= E && !isCFAlu(I)) - I++; - if (I == E) - return; - MachineInstr *MI = I++; - if (isCFAluEnabled(MI)) - break; - CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); - MI->eraseFromParent(); - } while (I != E); -} - -bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, - const MachineInstr *LatrCFAlu) const { - assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); - unsigned RootInstCount = getCFAluSize(RootCFAlu), - LaterInstCount = getCFAluSize(LatrCFAlu); - unsigned CumuledInsts = RootInstCount + LaterInstCount; - if (CumuledInsts >= TII->getMaxAlusPerClause()) { - DEBUG(dbgs() << "Excess inst counts\n"); - return false; - } - if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) - return false; - // Is KCache Bank 0 compatible ? - int Mode0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0); - int KBank0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); - int KBank0LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); - if (LatrCFAlu->getOperand(Mode0Idx).getImm() && - RootCFAlu->getOperand(Mode0Idx).getImm() && - (LatrCFAlu->getOperand(KBank0Idx).getImm() != - RootCFAlu->getOperand(KBank0Idx).getImm() || - LatrCFAlu->getOperand(KBank0LineIdx).getImm() != - RootCFAlu->getOperand(KBank0LineIdx).getImm())) { - DEBUG(dbgs() << "Wrong KC0\n"); - return false; - } - // Is KCache Bank 1 compatible ? 
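// Same rule as bank 0 (illustrative values): with KCACHE_MODE1 set on both
// clauses, BANK1 = 0 / ADDR1 = 16 in the root and BANK1 = 0 / ADDR1 = 32 in
// the later clause address different constant-cache lines, so the merge is
// rejected.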
- int Mode1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1); - int KBank1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1); - int KBank1LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1); - if (LatrCFAlu->getOperand(Mode1Idx).getImm() && - RootCFAlu->getOperand(Mode1Idx).getImm() && - (LatrCFAlu->getOperand(KBank1Idx).getImm() != - RootCFAlu->getOperand(KBank1Idx).getImm() || - LatrCFAlu->getOperand(KBank1LineIdx).getImm() != - RootCFAlu->getOperand(KBank1LineIdx).getImm())) { - DEBUG(dbgs() << "Wrong KC1\n"); - return false; - } - if (LatrCFAlu->getOperand(Mode0Idx).getImm()) { - RootCFAlu->getOperand(Mode0Idx).setImm( - LatrCFAlu->getOperand(Mode0Idx).getImm()); - RootCFAlu->getOperand(KBank0Idx).setImm( - LatrCFAlu->getOperand(KBank0Idx).getImm()); - RootCFAlu->getOperand(KBank0LineIdx).setImm( - LatrCFAlu->getOperand(KBank0LineIdx).getImm()); - } - if (LatrCFAlu->getOperand(Mode1Idx).getImm()) { - RootCFAlu->getOperand(Mode1Idx).setImm( - LatrCFAlu->getOperand(Mode1Idx).getImm()); - RootCFAlu->getOperand(KBank1Idx).setImm( - LatrCFAlu->getOperand(KBank1Idx).getImm()); - RootCFAlu->getOperand(KBank1LineIdx).setImm( - LatrCFAlu->getOperand(KBank1LineIdx).getImm()); - } - RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts); - RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode())); - return true; -} - -bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - MachineBasicBlock::iterator LatestCFAlu = E; - while (I != E) { - MachineInstr *MI = I++; - if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) || - TII->mustBeLastInClause(MI->getOpcode())) - LatestCFAlu = E; - if (!isCFAlu(MI)) - continue; - cleanPotentialDisabledCFAlu(MI); - - if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) { - MI->eraseFromParent(); - } else { - assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled"); - LatestCFAlu = MI; - } - } - } - return false; -} - -const char *R600ClauseMergePass::getPassName() const { - return "R600 Merge Clause Markers Pass"; -} - -} // end anonymous namespace - - -llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) { - return new R600ClauseMergePass(TM); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp (nonexistent) @@ -1,288 +0,0 @@ -//===-- SIFoldOperands.cpp - Fold operands --------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -/// \file -//===----------------------------------------------------------------------===// -// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -#define DEBUG_TYPE "si-fold-operands" -using namespace llvm; - -namespace { - -class SIFoldOperands : public MachineFunctionPass { -public: - static char ID; - -public: - SIFoldOperands() : MachineFunctionPass(ID) { - initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fold Operands"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -struct FoldCandidate { - MachineInstr *UseMI; - unsigned UseOpNo; - MachineOperand *OpToFold; - uint64_t ImmToFold; - - FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : - UseMI(MI), UseOpNo(OpNo) { - - if (FoldOp->isImm()) { - OpToFold = nullptr; - ImmToFold = FoldOp->getImm(); - } else { - assert(FoldOp->isReg()); - OpToFold = FoldOp; - } - } - - bool isImm() const { - return !OpToFold; - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE, - "SI Fold Operands", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE, - "SI Fold Operands", false, false) - -char SIFoldOperands::ID = 0; - -char &llvm::SIFoldOperandsID = SIFoldOperands::ID; - -FunctionPass *llvm::createSIFoldOperandsPass() { - return new SIFoldOperands(); -} - -static bool isSafeToFold(unsigned Opcode) { - switch(Opcode) { - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - case AMDGPU::V_MOV_B64_PSEUDO: - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::COPY: - return true; - default: - return false; - } -} - -static bool updateOperand(FoldCandidate &Fold, - const TargetRegisterInfo &TRI) { - MachineInstr *MI = Fold.UseMI; - MachineOperand &Old = MI->getOperand(Fold.UseOpNo); - assert(Old.isReg()); - - if (Fold.isImm()) { - Old.ChangeToImmediate(Fold.ImmToFold); - return true; - } - - MachineOperand *New = Fold.OpToFold; - if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && - TargetRegisterInfo::isVirtualRegister(New->getReg())) { - Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); - return true; - } - - // FIXME: Handle physical registers. - - return false; -} - -static bool tryAddToFoldList(std::vector &FoldList, - MachineInstr *MI, unsigned OpNo, - MachineOperand *OpToFold, - const SIInstrInfo *TII) { - if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { - // Operand is not legal, so try to commute the instruction to - // see if this makes it possible to fold. 
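// For instance (per-opcode rules vary; this is a sketch), src1 of many VOP2
// instructions must be a VGPR, so an operand that cannot legally fold there
// may still fold into src0 after the instruction is commuted; the index swap
// below keeps OpNo pointing at the operand being folded.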
- unsigned CommuteIdx0; - unsigned CommuteIdx1; - bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); - - if (CanCommute) { - if (CommuteIdx0 == OpNo) - OpNo = CommuteIdx1; - else if (CommuteIdx1 == OpNo) - OpNo = CommuteIdx0; - } - - if (!CanCommute || !TII->commuteInstruction(MI)) - return false; - - if (!TII->isOperandLegal(MI, OpNo, OpToFold)) - return false; - } - - FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); - return true; -} - -bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; - - if (!isSafeToFold(MI.getOpcode())) - continue; - - unsigned OpSize = TII->getOpSize(MI, 1); - MachineOperand &OpToFold = MI.getOperand(1); - bool FoldingImm = OpToFold.isImm(); - - // FIXME: We could also be folding things like FrameIndexes and - // TargetIndexes. - if (!FoldingImm && !OpToFold.isReg()) - continue; - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) && - !MRI.hasOneUse(MI.getOperand(0).getReg())) - continue; - - // FIXME: Fold operands with subregs. - if (OpToFold.isReg() && - (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || - OpToFold.getSubReg())) - continue; - - std::vector FoldList; - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); - Use != E; ++Use) { - - MachineInstr *UseMI = Use->getParent(); - const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); - - // FIXME: Fold operands with subregs. - if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || - UseOp.isImplicit())) { - continue; - } - - APInt Imm; - - if (FoldingImm) { - unsigned UseReg = UseOp.getReg(); - const TargetRegisterClass *UseRC - = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); - - Imm = APInt(64, OpToFold.getImm()); - - // Split 64-bit constants into 32-bits for folding. - if (UseOp.getSubReg()) { - if (UseRC->getSize() != 8) - continue; - - if (UseOp.getSubReg() == AMDGPU::sub0) { - Imm = Imm.getLoBits(32); - } else { - assert(UseOp.getSubReg() == AMDGPU::sub1); - Imm = Imm.getHiBits(32); - } - } - - // In order to fold immediates into copies, we need to change the - // copy to a MOV. - if (UseMI->getOpcode() == AMDGPU::COPY) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ? - MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); - - unsigned MovOp = TII->getMovOpcode(DestRC); - if (MovOp == AMDGPU::COPY) - continue; - - UseMI->setDesc(TII->get(MovOp)); - } - } - - const MCInstrDesc &UseDesc = UseMI->getDesc(); - - // Don't fold into target independent nodes. Target independent opcodes - // don't have defined register classes. 
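// (COPY, PHI and REG_SEQUENCE, for example, report RegClass == -1 in their
// MCInstrDesc operand info, so there is no register-class constraint to
// validate a fold against.)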
- if (UseDesc.isVariadic() || - UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) - continue; - - if (FoldingImm) { - MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); - continue; - } - - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); - - // FIXME: We could try to change the instruction from 64-bit to 32-bit - // to enable more folding opportunities. The shrink operands pass - // already does this. - } - - for (FoldCandidate &Fold : FoldList) { - if (updateOperand(Fold, TRI)) { - // Clear kill flags. - if (!Fold.isImm()) { - assert(Fold.OpToFold && Fold.OpToFold->isReg()); - Fold.OpToFold->setIsKill(false); - } - DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << - Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); - } - } - } - } - return false; -} Property changes on: projects/clang370-import/contrib/llvm/lib/Target/R600/SIFoldOperands.cpp ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: projects/clang370-import/contrib/llvm/lib/Target/R600/EvergreenInstructions.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/EvergreenInstructions.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/EvergreenInstructions.td (nonexistent) @@ -1,670 +0,0 @@ -//===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// TableGen definitions for instructions which are: -// - Available to Evergreen and newer VLIW4/VLIW5 GPUs -// - Available only on Evergreen family GPUs. 
-// -//===----------------------------------------------------------------------===// - -def isEG : Predicate< - "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " - "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "!Subtarget->hasCaymanISA()" ->; - -def isEGorCayman : Predicate< - "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||" - "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" ->; - -//===----------------------------------------------------------------------===// -// Evergreen / Cayman store instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isEGorCayman] in { - -class CF_MEM_RAT_CACHELESS rat_inst, bits<4> rat_id, bits<4> mask, dag ins, - string name, list pattern> - : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins, - "MEM_RAT_CACHELESS "#name, pattern>; - -class CF_MEM_RAT rat_inst, bits<4> rat_id, dag ins, string name, - list pattern> - : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, - "MEM_RAT "#name, pattern>; - -def RAT_MSKOR : CF_MEM_RAT <0x11, 0, - (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), - "MSKOR $rw_gpr.XW, $index_gpr", - [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)] -> { - let eop = 0; -} - -} // End let Predicates = [isEGorCayman] - -//===----------------------------------------------------------------------===// -// Evergreen Only instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isEG] in { - -def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; -defm DIV_eg : DIV_Common; - -def MULLO_INT_eg : MULLO_INT_Common<0x8F>; -def MULHI_INT_eg : MULHI_INT_Common<0x90>; -def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; -def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; -def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; -def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; -def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; -def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; -def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; -def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; -def : RsqPat; -def SIN_eg : SIN_Common<0x8D>; -def COS_eg : COS_Common<0x8E>; - -def : POW_Common ; -def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; - -defm : Expand24IBitOps; - -//===----------------------------------------------------------------------===// -// Memory read/write instructions -//===----------------------------------------------------------------------===// - -let usesCustomInserter = 1 in { - -// 32-bit store -def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1, - (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - "STORE_RAW $rw_gpr, $index_gpr, $eop", - [(global_store i32:$rw_gpr, i32:$index_gpr)] ->; - -// 64-bit store -def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3, - (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - "STORE_RAW $rw_gpr.XY, $index_gpr, $eop", - [(global_store v2i32:$rw_gpr, i32:$index_gpr)] ->; - -//128-bit store -def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, - (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop", - [(global_store v4i32:$rw_gpr, i32:$index_gpr)] ->; - -} // End usesCustomInserter = 1 - -class VTX_READ_eg buffer_id, dag outs, list pattern> - : VTX_WORD0_eg, VTX_READ { - - // Static fields - let VC_INST = 0; - let FETCH_TYPE = 2; - let FETCH_WHOLE_QUAD = 0; - let BUFFER_ID = buffer_id; - let 
SRC_REL = 0; - // XXX: We can infer this field based on the SRC_GPR. This would allow us - // to store vertex addresses in any channel, not just X. - let SRC_SEL_X = 0; - - let Inst{31-0} = Word0; -} - -class VTX_READ_8_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 1; - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 1; // FMT_8 -} - -class VTX_READ_16_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - let MEGA_FETCH_COUNT = 2; - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 5; // FMT_16 - -} - -class VTX_READ_32_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 4; - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 0xD; // COLOR_32 - - // This is not really necessary, but there were some GPU hangs that appeared - // to be caused by ALU instructions in the next instruction group that wrote - // to the $src_gpr registers of the VTX_READ. - // e.g. - // %T3_X = VTX_READ_PARAM_32_eg %T2_X, 24 - // %T2_X = MOV %ZERO - //Adding this constraint prevents this from happening. - let Constraints = "$src_gpr.ptr = $dst_gpr"; -} - -class VTX_READ_64_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id, - (outs R600_Reg64:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 8; - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 7; - let DST_SEL_W = 7; - let DATA_FORMAT = 0x1D; // COLOR_32_32 -} - -class VTX_READ_128_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, - (outs R600_Reg128:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 16; - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 2; - let DST_SEL_W = 3; - let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 - - // XXX: Need to force VTX_READ_128 instructions to write to the same register - // that holds its buffer address to avoid potential hangs. We can't use - // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst - // registers are different sizes. 
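// The tie that VTX_READ_32_eg uses for this purpose is (sketch of the
// existing syntax, not applicable here because of the size mismatch):
//   let Constraints = "$src_gpr.ptr = $dst_gpr";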
-} - -//===----------------------------------------------------------------------===// -// VTX Read from parameter memory space -//===----------------------------------------------------------------------===// - -def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, - [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, - [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, - [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0, - [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, - [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -//===----------------------------------------------------------------------===// -// VTX Read from global memory space -//===----------------------------------------------------------------------===// - -// 8-bit reads -def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, - [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1, - [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] ->; - -// 32-bit reads -def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, - [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 64-bit reads -def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1, - [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 128-bit reads -def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, - [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -} // End Predicates = [isEG] - -//===----------------------------------------------------------------------===// -// Evergreen / Cayman Instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isEGorCayman] in { - -// Should be predicated on FeatureFP64 -// def FMA_64 : R600_3OP < -// 0xA, "FMA_64", -// [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] -// >; - -// BFE_UINT - bit_extract, an optimization for mask and shift -// Src0 = Input -// Src1 = Offset -// Src2 = Width -// -// bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width) -// -// Example Usage: -// (Offset, Width) -// -// (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0 -// (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8 -// (16, 8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 -// (24, 8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 -def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", - [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))], - VecALU ->; - -def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", - [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))], - VecALU ->; - -def : BFEPattern ; - -def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", - [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))], - VecALU ->; - -def : Pat<(i32 (sext_inreg i32:$src, i1)), - (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>; -def : Pat<(i32 (sext_inreg i32:$src, i8)), - (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>; -def : Pat<(i32 (sext_inreg i32:$src, i16)), - (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; - -defm : BFIPatterns ; - -def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", - [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], - VecALU ->; - -def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", - [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU ->; - -def : UMad24Pat; - -def 
BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; -def : ROTRPattern ; -def MULADD_eg : MULADD_Common<0x14>; -def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; -def FMA_eg : FMA_Common<0x7>; -def ASHR_eg : ASHR_Common<0x15>; -def LSHR_eg : LSHR_Common<0x16>; -def LSHL_eg : LSHL_Common<0x17>; -def CNDE_eg : CNDE_Common<0x19>; -def CNDGT_eg : CNDGT_Common<0x1A>; -def CNDGE_eg : CNDGE_Common<0x1B>; -def MUL_LIT_eg : MUL_LIT_Common<0x1F>; -def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; -def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", - [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU ->; -def DOT4_eg : DOT4_Common<0xBE>; -defm CUBE_eg : CUBE_Common<0xC0>; - -def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; - -def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; -def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; - -def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; -def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; - -let hasSideEffects = 1 in { - def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; -} - -def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common; - -def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { - let Pattern = []; - let Itinerary = AnyALU; -} - -def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>; - -def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { - let Pattern = []; -} - -def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; - -def GROUP_BARRIER : InstR600 < - (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, - R600ALU_Word0, - R600ALU_Word1_OP2 <0x54> { - - let dst = 0; - let dst_rel = 0; - let src0 = 0; - let src0_rel = 0; - let src0_neg = 0; - let src0_abs = 0; - let src1 = 0; - let src1_rel = 0; - let src1_neg = 0; - let src1_abs = 0; - let write = 0; - let omod = 0; - let clamp = 0; - let last = 1; - let bank_swizzle = 0; - let pred_sel = 0; - let update_exec_mask = 0; - let update_pred = 0; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - - let ALUInst = 1; -} - -def : Pat < - (int_AMDGPU_barrier_global), - (GROUP_BARRIER) ->; - -//===----------------------------------------------------------------------===// -// LDS Instructions -//===----------------------------------------------------------------------===// -class R600_LDS op, dag outs, dag ins, string asm, - list pattern = []> : - - InstR600 , - R600_ALU_LDS_Word0, - R600LDS_Word1 { - - bits<6> offset = 0; - let lds_op = op; - - let Word1{27} = offset{0}; - let Word1{12} = offset{1}; - let Word1{28} = offset{2}; - let Word1{31} = offset{3}; - let Word0{12} = offset{4}; - let Word0{25} = offset{5}; - - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - - let ALUInst = 1; - let HasNativeOperands = 1; - let UseNamedOperandTable = 1; -} - -class R600_LDS_1A lds_op, string name, list pattern> : R600_LDS < - lds_op, - (outs R600_Reg32:$dst), - (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, - LAST:$last, R600_Pred:$pred_sel, - BANK_SWIZZLE:$bank_swizzle), - " "#name#" $last OQAP, $src0$src0_rel $pred_sel", - pattern - > { - - let src1 = 0; - let src1_rel = 0; - let src2 = 0; - let src2_rel = 0; - - let usesCustomInserter = 1; - let LDS_1A = 1; - let DisableEncoding = "$dst"; -} - -class R600_LDS_1A1D lds_op, dag outs, string name, list pattern, - string dst =""> : - R600_LDS < - lds_op, outs, - (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, - R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, - LAST:$last, R600_Pred:$pred_sel, - BANK_SWIZZLE:$bank_swizzle), 
- " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel", - pattern - > { - - field string BaseOp; - - let src2 = 0; - let src2_rel = 0; - let LDS_1A1D = 1; -} - -class R600_LDS_1A1D_NORET lds_op, string name, list pattern> : - R600_LDS_1A1D { - let BaseOp = name; -} - -class R600_LDS_1A1D_RET lds_op, string name, list pattern> : - R600_LDS_1A1D { - - let BaseOp = name; - let usesCustomInserter = 1; - let DisableEncoding = "$dst"; -} - -class R600_LDS_1A2D lds_op, dag outs, string name, list pattern, - string dst =""> : - R600_LDS < - lds_op, outs, - (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, - R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, - R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel, - LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle), - " "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", - pattern> { - - field string BaseOp; - - let LDS_1A1D = 0; - let LDS_1A2D = 1; -} - -class R600_LDS_1A2D_NORET lds_op, string name, list pattern> : - R600_LDS_1A2D { - let BaseOp = name; -} - -class R600_LDS_1A2D_RET lds_op, string name, list pattern> : - R600_LDS_1A2D { - - let BaseOp = name; - let usesCustomInserter = 1; - let DisableEncoding = "$dst"; -} - -def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >; -def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >; -def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >; -def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >; -def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >; -def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >; -def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >; -def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >; -def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >; -def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >; -def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >; -def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE", - [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] ->; -def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE", - [(truncstorei8_local i32:$src1, i32:$src0)] ->; -def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", - [(truncstorei16_local i32:$src1, i32:$src0)] ->; -def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", - [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))] ->; -def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", - [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))] ->; -def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", - [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))] ->; -def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", - [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))] ->; -def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", - [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))] ->; -def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", - [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))] ->; -def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", - [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))] ->; -def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", - [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))] ->; -def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", - [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))] ->; -def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", - [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] ->; -def 
LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", - [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))] ->; -def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", - [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] ->; -def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET", - [(set i32:$dst, (sextloadi8_local i32:$src0))] ->; -def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET", - [(set i32:$dst, (az_extloadi8_local i32:$src0))] ->; -def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET", - [(set i32:$dst, (sextloadi16_local i32:$src0))] ->; -def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET", - [(set i32:$dst, (az_extloadi16_local i32:$src0))] ->; - -// TRUNC is used for the FLT_TO_INT instructions to work around a -// perceived problem where the rounding modes are applied differently -// depending on the instruction and the slot they are in. -// See: -// https://bugs.freedesktop.org/show_bug.cgi?id=50232 -// Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c -// -// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, -// which do not need to be truncated since the fp values are 0.0f or 1.0f. -// We should look into handling these cases separately. -def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; - -def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; - -// SHA-256 Patterns -def : SHA256MaPattern ; - -def EG_ExportSwz : ExportSwzInst { - let Word1{19-16} = 0; // BURST_COUNT - let Word1{20} = 0; // VALID_PIXEL_MODE - let Word1{21} = eop; - let Word1{29-22} = inst; - let Word1{30} = 0; // MARK - let Word1{31} = 1; // BARRIER -} -defm : ExportPattern; - -def EG_ExportBuf : ExportBufInst { - let Word1{19-16} = 0; // BURST_COUNT - let Word1{20} = 0; // VALID_PIXEL_MODE - let Word1{21} = eop; - let Word1{29-22} = inst; - let Word1{30} = 0; // MARK - let Word1{31} = 1; // BARRIER -} -defm : SteamOutputExportPattern; - -def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT), - "TEX $COUNT @$ADDR"> { - let POP_COUNT = 0; -} -def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT), - "VTX $COUNT @$ADDR"> { - let POP_COUNT = 0; -} -def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR), - "LOOP_START_DX10 @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR), - "LOOP_BREAK @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR), - "CONTINUE @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "JUMP @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "PUSH @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "ELSE @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> { - let ADDR = 0; - let COUNT = 0; - let POP_COUNT = 0; -} -def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "POP @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> { - let COUNT = 0; - let POP_COUNT = 0; - let ADDR = 0; - let END_OF_PROGRAM = 1; -} - -} // End Predicates = [isEGorCayman] Index: 
projects/clang370-import/contrib/llvm/lib/Target/R600/R600OptimizeVectorRegisters.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600OptimizeVectorRegisters.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600OptimizeVectorRegisters.cpp (nonexistent) @@ -1,382 +0,0 @@ -//===--------------------- R600MergeVectorRegisters.cpp -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass merges the inputs of swizzleable instructions into vectors that -/// share common data and/or have enough undef subregisters, using the -/// instructions' swizzle abilities. -/// -/// For instance, consider the following pseudo code: -/// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 -/// ... -/// vreg7 = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3 -/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3 -/// -/// which is turned into: -/// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 -/// ... -/// vreg7 = INSERT_SUBREG vreg4, sub3 -/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3 -/// -/// This allows regalloc to reduce register pressure for vector registers and -/// to reduce the MOV count. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "llvm/CodeGen/DFAPacketizer.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "vec-merger" - -namespace { - -static bool -isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { - for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg), - E = MRI.def_instr_end(); It != E; ++It) { - return (*It).isImplicitDef(); - } - if (MRI.isReserved(Reg)) { - return false; - } - llvm_unreachable("Reg without a def"); - return false; -} - -class RegSeqInfo { -public: - MachineInstr *Instr; - DenseMap<unsigned, unsigned> RegToChan; - std::vector<unsigned> UndefReg; - RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { - assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE); - for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) { - MachineOperand &MO = Instr->getOperand(i); - unsigned Chan = Instr->getOperand(i + 1).getImm(); - if (isImplicitlyDef(MRI, MO.getReg())) - UndefReg.push_back(Chan); - else - RegToChan[MO.getReg()] = Chan; - } - } - RegSeqInfo() {} - - bool operator==(const RegSeqInfo &RSI) const { - return RSI.Instr == Instr; - } -}; - -class R600VectorRegMerger : public MachineFunctionPass { -private: - MachineRegisterInfo *MRI; - const R600InstrInfo *TII; - bool canSwizzle(const MachineInstr &) const; - bool areAllUsesSwizzeable(unsigned Reg) const; - void SwizzleInput(MachineInstr &, - const std::vector<std::pair<unsigned, unsigned> > &) const; - bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *, - std::vector<std::pair<unsigned, unsigned> > &Remap) const; - bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, - std::vector<std::pair<unsigned, unsigned> > &RemapChan); - bool
tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, - std::vector > &RemapChan); - MachineInstr *RebuildVector(RegSeqInfo *MI, - const RegSeqInfo *BaseVec, - const std::vector > &RemapChan) const; - void RemoveMI(MachineInstr *); - void trackRSI(const RegSeqInfo &RSI); - - typedef DenseMap > InstructionSetMap; - DenseMap PreviousRegSeq; - InstructionSetMap PreviousRegSeqByReg; - InstructionSetMap PreviousRegSeqByUndefCount; -public: - static char ID; - R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), - TII(nullptr) { } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - const char *getPassName() const override { - return "R600 Vector Registers Merge Pass"; - } - - bool runOnMachineFunction(MachineFunction &Fn) override; -}; - -char R600VectorRegMerger::ID = 0; - -bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) - const { - if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) - return true; - switch (MI.getOpcode()) { - case AMDGPU::R600_ExportSwz: - case AMDGPU::EG_ExportSwz: - return true; - default: - return false; - } -} - -bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, - RegSeqInfo *ToMerge, std::vector< std::pair > &Remap) - const { - unsigned CurrentUndexIdx = 0; - for (DenseMap::iterator It = ToMerge->RegToChan.begin(), - E = ToMerge->RegToChan.end(); It != E; ++It) { - DenseMap::const_iterator PosInUntouched = - Untouched->RegToChan.find((*It).first); - if (PosInUntouched != Untouched->RegToChan.end()) { - Remap.push_back(std::pair - ((*It).second, (*PosInUntouched).second)); - continue; - } - if (CurrentUndexIdx >= Untouched->UndefReg.size()) - return false; - Remap.push_back(std::pair - ((*It).second, Untouched->UndefReg[CurrentUndexIdx++])); - } - - return true; -} - -static -unsigned getReassignedChan( - const std::vector > &RemapChan, - unsigned Chan) { - for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { - if (RemapChan[j].first == Chan) - return RemapChan[j].second; - } - llvm_unreachable("Chan wasn't reassigned"); -} - -MachineInstr *R600VectorRegMerger::RebuildVector( - RegSeqInfo *RSI, const RegSeqInfo *BaseRSI, - const std::vector > &RemapChan) const { - unsigned Reg = RSI->Instr->getOperand(0).getReg(); - MachineBasicBlock::iterator Pos = RSI->Instr; - MachineBasicBlock &MBB = *Pos->getParent(); - DebugLoc DL = Pos->getDebugLoc(); - - unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg(); - DenseMap UpdatedRegToChan = BaseRSI->RegToChan; - std::vector UpdatedUndef = BaseRSI->UndefReg; - for (DenseMap::iterator It = RSI->RegToChan.begin(), - E = RSI->RegToChan.end(); It != E; ++It) { - unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - unsigned SubReg = (*It).first; - unsigned Swizzle = (*It).second; - unsigned Chan = getReassignedChan(RemapChan, Swizzle); - - MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG), - DstReg) - .addReg(SrcVec) - .addReg(SubReg) - .addImm(Chan); - UpdatedRegToChan[SubReg] = Chan; - std::vector::iterator ChanPos = - std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan); - if (ChanPos != UpdatedUndef.end()) - UpdatedUndef.erase(ChanPos); - assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) == - UpdatedUndef.end() && - "UpdatedUndef shouldn't contain Chan more than once!"); - DEBUG(dbgs() << " ->"; Tmp->dump();); - (void)Tmp; - 
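- // Each INSERT_SUBREG above defines a fresh 128-bit register; carrying it - // forward in SrcVec chains the next insertion onto the vector just built.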
SrcVec = DstReg; - } - Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg) - .addReg(SrcVec); - DEBUG(dbgs() << " ->"; Pos->dump();); - - DEBUG(dbgs() << " Updating Swizzle:\n"); - for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), - E = MRI->use_instr_end(); It != E; ++It) { - DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->"); - SwizzleInput(*It, RemapChan); - DEBUG((*It).dump()); - } - RSI->Instr->eraseFromParent(); - - // Update RSI - RSI->Instr = Pos; - RSI->RegToChan = UpdatedRegToChan; - RSI->UndefReg = UpdatedUndef; - - return Pos; -} - -void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { - for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), - E = PreviousRegSeqByReg.end(); It != E; ++It) { - std::vector &MIs = (*It).second; - MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); - } - for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), - E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { - std::vector &MIs = (*It).second; - MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); - } -} - -void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, - const std::vector > &RemapChan) const { - unsigned Offset; - if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) - Offset = 2; - else - Offset = 3; - for (unsigned i = 0; i < 4; i++) { - unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1; - for (unsigned j = 0, e = RemapChan.size(); j < e; j++) { - if (RemapChan[j].first == Swizzle) { - MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1); - break; - } - } - } -} - -bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const { - for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), - E = MRI->use_instr_end(); It != E; ++It) { - if (!canSwizzle(*It)) - return false; - } - return true; -} - -bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI, - RegSeqInfo &CompatibleRSI, - std::vector > &RemapChan) { - for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(), - MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) { - if (!MOp->isReg()) - continue; - if (PreviousRegSeqByReg[MOp->getReg()].empty()) - continue; - for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) { - CompatibleRSI = PreviousRegSeq[MI]; - if (RSI == CompatibleRSI) - continue; - if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan)) - return true; - } - } - return false; -} - -bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI, - RegSeqInfo &CompatibleRSI, - std::vector > &RemapChan) { - unsigned NeededUndefs = 4 - RSI.UndefReg.size(); - if (PreviousRegSeqByUndefCount[NeededUndefs].empty()) - return false; - std::vector &MIs = - PreviousRegSeqByUndefCount[NeededUndefs]; - CompatibleRSI = PreviousRegSeq[MIs.back()]; - tryMergeVector(&CompatibleRSI, &RSI, RemapChan); - return true; -} - -void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { - for (DenseMap::const_iterator - It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) { - PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr); - } - PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr); - PreviousRegSeq[RSI.Instr] = RSI; -} - -bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast(Fn.getSubtarget().getInstrInfo()); - MRI = &(Fn.getRegInfo()); - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock *MB = MBB; - PreviousRegSeq.clear(); - PreviousRegSeqByReg.clear(); - 
PreviousRegSeqByUndefCount.clear(); - - for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); - MII != MIIE; ++MII) { - MachineInstr *MI = MII; - if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) { - if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { - unsigned Reg = MI->getOperand(1).getReg(); - for (MachineRegisterInfo::def_instr_iterator - It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end(); - It != E; ++It) { - RemoveMI(&(*It)); - } - } - continue; - } - - - RegSeqInfo RSI(*MRI, MI); - - // Are all uses of MI swizzleable? - unsigned Reg = MI->getOperand(0).getReg(); - if (!areAllUsesSwizzeable(Reg)) - continue; - - DEBUG (dbgs() << "Trying to optimize "; - MI->dump(); - ); - - RegSeqInfo CandidateRSI; - std::vector<std::pair<unsigned, unsigned> > RemapChan; - DEBUG(dbgs() << "Using common slots...\n";); - if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) { - // Remove CandidateRSI mapping - RemoveMI(CandidateRSI.Instr); - MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); - trackRSI(RSI); - continue; - } - DEBUG(dbgs() << "Using free slots...\n";); - RemapChan.clear(); - if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) { - RemoveMI(CandidateRSI.Instr); - MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); - trackRSI(RSI); - continue; - } - // Failed to merge - trackRSI(RSI); - } - } - return false; -} - -} - -llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) { - return new R600VectorRegMerger(tm); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIFixSGPRCopies.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIFixSGPRCopies.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIFixSGPRCopies.cpp (nonexistent) @@ -1,338 +0,0 @@ -//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Copies from VGPR to SGPR registers are illegal and the register coalescer -/// will sometimes generate these illegal copies in situations like this: -/// -/// Register Class <vsrc> is the union of <vgpr> and <sgpr> -/// -/// BB0: -/// %vreg0 <sgpr> = SCALAR_INST -/// %vreg1 <vsrc> = COPY %vreg0 <sgpr> -/// ... -/// BRANCH %cond BB1, BB2 -/// BB1: -/// %vreg2 <vgpr> = VECTOR_INST -/// %vreg3 <vsrc> = COPY %vreg2 <vgpr> -/// BB2: -/// %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vsrc>, <BB#1> -/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc> -/// -/// -/// The coalescer will begin at BB0 and eliminate its copy, then the resulting -/// code will look like this: -/// -/// BB0: -/// %vreg0 <sgpr> = SCALAR_INST -/// ... -/// BRANCH %cond BB1, BB2 -/// BB1: -/// %vreg2 <vgpr> = VECTOR_INST -/// %vreg3 <vsrc> = COPY %vreg2 <vgpr> -/// BB2: -/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1> -/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr> -/// -/// Now that the result of the PHI instruction is an SGPR, the register -/// allocator is forced to constrain the register class of %vreg3 to -/// <sgpr> so we end up with final code like this: -/// -/// BB0: -/// %vreg0 <sgpr> = SCALAR_INST -/// ... -/// BRANCH %cond BB1, BB2 -/// BB1: -/// %vreg2 <vgpr> = VECTOR_INST -/// %vreg3 <sgpr> = COPY %vreg2 <vgpr> -/// BB2: -/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1> -/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr> -/// -/// Now this code contains an illegal copy from a VGPR to an SGPR.
-/// -/// In order to avoid this problem, this pass searches for PHI instructions -/// which define a <vsrc> register and constrains its definition class to -/// <vsrc> if the user of the PHI's definition register is a vector instruction. -/// If the PHI's definition class is constrained to <vsrc> then the coalescer -/// will be unable to perform the COPY removal from the above example, which -/// ultimately led to the creation of an illegal COPY. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "sgpr-copies" - -namespace { - -class SIFixSGPRCopies : public MachineFunctionPass { - -private: - static char ID; - const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const; - -public: - SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fix SGPR copies"; - } - -}; - -} // End anonymous namespace - -char SIFixSGPRCopies::ID = 0; - -FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) { - return new SIFixSGPRCopies(tm); -} - -static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - if (!MI.getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) - continue; - - if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg()))) - return true; - } - return false; -} - -/// This function walks the use list of Reg until it finds an instruction -/// that isn't a COPY, and returns the register class of that instruction. -/// \return The register class of the first non-COPY instruction. -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - - const TargetRegisterClass *RC - = TargetRegisterInfo::isVirtualRegister(Reg) ?
- MRI.getRegClass(Reg) : - TRI->getPhysRegClass(Reg); - - RC = TRI->getSubRegClass(RC, SubReg); - for (MachineRegisterInfo::use_instr_iterator - I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { - switch (I->getOpcode()) { - case AMDGPU::COPY: - RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI, - I->getOperand(0).getReg(), - I->getOperand(0).getSubReg())); - break; - } - } - - return RC; -} - -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg); - return TRI->getSubRegClass(RC, SubReg); - } - MachineInstr *Def = MRI.getVRegDef(Reg); - if (Def->getOpcode() != AMDGPU::COPY) { - return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg); - } - - return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(), - Def->getOperand(1).getSubReg()); -} - -bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const { - - unsigned DstReg = Copy.getOperand(0).getReg(); - unsigned SrcReg = Copy.getOperand(1).getReg(); - unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); - - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { - // If the destination register is a physical register there isn't really - // much we can do to fix this. - return false; - } - - const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); - - const TargetRegisterClass *SrcRC; - - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) - return false; - - SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); - return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC); -} - -bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = - static_cast(MF.getSubtarget().getRegisterInfo()); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) { - DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n"); - DEBUG(MI.print(dbgs())); - TII->moveToVALU(MI); - - } - - switch (MI.getOpcode()) { - default: continue; - case AMDGPU::PHI: { - DEBUG(dbgs() << "Fixing PHI: " << MI); - - for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { - const MachineOperand &Op = MI.getOperand(i); - unsigned Reg = Op.getReg(); - const TargetRegisterClass *RC - = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); - - MRI.constrainRegClass(Op.getReg(), RC); - } - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, - MI.getOperand(0).getSubReg()); - if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { - MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); - } - - if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) - break; - - // If a PHI node defines an SGPR and any of its operands are VGPRs, - // then we need to move it to the VALU. 
- // - // Also, if a PHI node defines an SGPR and has all SGPR operands - // we must move it to the VALU, because the SGPR operands will - // all end up being assigned the same register, which means - // there is a potential for a conflict if different threads take - // different control flow paths. - // - // For example: - // - // sgpr0 = def; - // ... - // sgpr1 = def; - // ... - // sgpr2 = PHI sgpr0, sgpr1 - // use sgpr2; - // - // Will become: - // - // sgpr2 = def; - // ... - // sgpr2 = def; - // ... - // use sgpr2 - // - // FIXME: This is OK if the branching decision is made based on an - // SGPR value. - bool SGPRBranch = false; - - // The one exception to this rule is when one of the operands - // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK - // instruction. In this case, we know the program will - // never enter the second block (the loop) without entering - // the first block (where the condition is computed), so there - // is no chance for values to be overwritten. - - bool HasBreakDef = false; - for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { - unsigned Reg = MI.getOperand(i).getReg(); - if (TRI->hasVGPRs(MRI.getRegClass(Reg))) { - TII->moveToVALU(MI); - break; - } - MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); - assert(DefInstr); - switch(DefInstr->getOpcode()) { - - case AMDGPU::SI_BREAK: - case AMDGPU::SI_IF_BREAK: - case AMDGPU::SI_ELSE_BREAK: - // If we see a PHI instruction that defines an SGPR, then that PHI - // instruction has already been considered and should have - // a *_BREAK as an operand. - case AMDGPU::PHI: - HasBreakDef = true; - break; - } - } - - if (!SGPRBranch && !HasBreakDef) - TII->moveToVALU(MI); - break; - } - case AMDGPU::REG_SEQUENCE: { - if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || - !hasVGPROperands(MI, TRI)) - continue; - - DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); - - TII->moveToVALU(MI); - break; - } - case AMDGPU::INSERT_SUBREG: { - const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; - DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); - Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); - Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); - if (TRI->isSGPRClass(DstRC) && - (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { - DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); - TII->moveToVALU(MI); - } - break; - } - } - } - } - - return true; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h (nonexistent) @@ -1,89 +0,0 @@ -//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief The AMDGPU TargetMachine interface definition for hw codegen targets.
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H -#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H - -#include "AMDGPUFrameLowering.h" -#include "AMDGPUInstrInfo.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include "llvm/IR/DataLayout.h" - -namespace llvm { - -//===----------------------------------------------------------------------===// -// AMDGPU Target Machine (R600+) -//===----------------------------------------------------------------------===// - -class AMDGPUTargetMachine : public LLVMTargetMachine { -private: - -protected: - TargetLoweringObjectFile *TLOF; - AMDGPUSubtarget Subtarget; - AMDGPUIntrinsicInfo IntrinsicInfo; - -public: - AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); - ~AMDGPUTargetMachine(); - - const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } - const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override { - return &Subtarget; - } - const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { - return &IntrinsicInfo; - } - TargetIRAnalysis getTargetIRAnalysis() override; - - TargetLoweringObjectFile *getObjFileLowering() const override { - return TLOF; - } -}; - -//===----------------------------------------------------------------------===// -// R600 Target Machine (R600 -> Cayman) -//===----------------------------------------------------------------------===// - -class R600TargetMachine : public AMDGPUTargetMachine { - -public: - R600TargetMachine(const Target &T, StringRef TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); - - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -}; - -//===----------------------------------------------------------------------===// -// GCN Target Machine (SI+) -//===----------------------------------------------------------------------===// - -class GCNTargetMachine : public AMDGPUTargetMachine { - -public: - GCNTargetMachine(const Target &T, StringRef TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); - - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -}; - -} // End namespace llvm - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUCallingConv.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUCallingConv.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUCallingConv.td (nonexistent) @@ -1,82 +0,0 @@ -//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This describes the calling conventions for the AMD Radeon GPUs. 
-// -//===----------------------------------------------------------------------===// - -// Inversion of CCIfInReg -class CCIfNotInReg : CCIf<"!ArgFlags.isInReg()", A> {} - -// Calling convention for SI -def CC_SI : CallingConv<[ - - CCIfInReg>>, - - CCIfInReg>>, - - CCIfNotInReg>>, - - CCIfByVal>> - -]>; - -// Calling convention for R600 -def CC_R600 : CallingConv<[ - CCIfInReg>> -]>; - -// Calling convention for compute kernels -def CC_AMDGPU_Kernel : CallingConv<[ - CCCustom<"allocateStack"> -]>; - -def CC_AMDGPU : CallingConv<[ - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() >=" - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo()" - "->getShaderType() == ShaderType::COMPUTE", - CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo()" - "->getShaderType() == ShaderType::COMPUTE", - CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() >= " - "AMDGPUSubtarget::SOUTHERN_ISLANDS", - CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS", - CCDelegateTo> -]>; Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600InstrFormats.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600InstrFormats.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600InstrFormats.td (nonexistent) @@ -1,495 +0,0 @@ -//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// R600 Instruction format definitions. -// -//===----------------------------------------------------------------------===// - -class InstR600 pattern, - InstrItinClass itin> - : AMDGPUInst { - - field bits<64> Inst; - bit Trig = 0; - bit Op3 = 0; - bit isVector = 0; - bits<2> FlagOperandIdx = 0; - bit Op1 = 0; - bit Op2 = 0; - bit LDS_1A = 0; - bit LDS_1A1D = 0; - bit HasNativeOperands = 0; - bit VTXInst = 0; - bit TEXInst = 0; - bit ALUInst = 0; - bit IsExport = 0; - bit LDS_1A2D = 0; - - let Namespace = "AMDGPU"; - let OutOperandList = outs; - let InOperandList = ins; - let AsmString = asm; - let Pattern = pattern; - let Itinerary = itin; - - // No AsmMatcher support. 
- let isCodeGenOnly = 1; - - let TSFlags{4} = Trig; - let TSFlags{5} = Op3; - - // Vector instructions are instructions that must fill all slots in an - // instruction group - let TSFlags{6} = isVector; - let TSFlags{8-7} = FlagOperandIdx; - let TSFlags{9} = HasNativeOperands; - let TSFlags{10} = Op1; - let TSFlags{11} = Op2; - let TSFlags{12} = VTXInst; - let TSFlags{13} = TEXInst; - let TSFlags{14} = ALUInst; - let TSFlags{15} = LDS_1A; - let TSFlags{16} = LDS_1A1D; - let TSFlags{17} = IsExport; - let TSFlags{18} = LDS_1A2D; -} - -//===----------------------------------------------------------------------===// -// ALU instructions -//===----------------------------------------------------------------------===// - -class R600_ALU_LDS_Word0 { - field bits<32> Word0; - - bits<11> src0; - bits<1> src0_rel; - bits<11> src1; - bits<1> src1_rel; - bits<3> index_mode = 0; - bits<2> pred_sel; - bits<1> last; - - bits<9> src0_sel = src0{8-0}; - bits<2> src0_chan = src0{10-9}; - bits<9> src1_sel = src1{8-0}; - bits<2> src1_chan = src1{10-9}; - - let Word0{8-0} = src0_sel; - let Word0{9} = src0_rel; - let Word0{11-10} = src0_chan; - let Word0{21-13} = src1_sel; - let Word0{22} = src1_rel; - let Word0{24-23} = src1_chan; - let Word0{28-26} = index_mode; - let Word0{30-29} = pred_sel; - let Word0{31} = last; -} - -class R600ALU_Word0 : R600_ALU_LDS_Word0 { - - bits<1> src0_neg; - bits<1> src1_neg; - - let Word0{12} = src0_neg; - let Word0{25} = src1_neg; -} - -class R600ALU_Word1 { - field bits<32> Word1; - - bits<11> dst; - bits<3> bank_swizzle; - bits<1> dst_rel; - bits<1> clamp; - - bits<7> dst_sel = dst{6-0}; - bits<2> dst_chan = dst{10-9}; - - let Word1{20-18} = bank_swizzle; - let Word1{27-21} = dst_sel; - let Word1{28} = dst_rel; - let Word1{30-29} = dst_chan; - let Word1{31} = clamp; -} - -class R600ALU_Word1_OP2 alu_inst> : R600ALU_Word1{ - - bits<1> src0_abs; - bits<1> src1_abs; - bits<1> update_exec_mask; - bits<1> update_pred; - bits<1> write; - bits<2> omod; - - let Word1{0} = src0_abs; - let Word1{1} = src1_abs; - let Word1{2} = update_exec_mask; - let Word1{3} = update_pred; - let Word1{4} = write; - let Word1{6-5} = omod; - let Word1{17-7} = alu_inst; -} - -class R600ALU_Word1_OP3 alu_inst> : R600ALU_Word1{ - - bits<11> src2; - bits<1> src2_rel; - bits<1> src2_neg; - - bits<9> src2_sel = src2{8-0}; - bits<2> src2_chan = src2{10-9}; - - let Word1{8-0} = src2_sel; - let Word1{9} = src2_rel; - let Word1{11-10} = src2_chan; - let Word1{12} = src2_neg; - let Word1{17-13} = alu_inst; -} - -class R600LDS_Word1 { - field bits<32> Word1; - - bits<11> src2; - bits<9> src2_sel = src2{8-0}; - bits<2> src2_chan = src2{10-9}; - bits<1> src2_rel; - // offset specifies the stride offset to the second set of data to be read - // from. This is a dword offset. - bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP - bits<3> bank_swizzle; - bits<6> lds_op; - bits<2> dst_chan = 0; - - let Word1{8-0} = src2_sel; - let Word1{9} = src2_rel; - let Word1{11-10} = src2_chan; - let Word1{17-13} = alu_inst; - let Word1{20-18} = bank_swizzle; - let Word1{26-21} = lds_op; - let Word1{30-29} = dst_chan; -} - - -/* -XXX: R600 subtarget uses a slightly different encoding than the other -subtargets. We currently handle this in R600MCCodeEmitter, but we may -want to use these instruction classes in the future. 
- -class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 { - - bits<1> fog_merge; - bits<10> alu_inst; - - let Inst{37} = fog_merge; - let Inst{39-38} = omod; - let Inst{49-40} = alu_inst; -} - -class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 { - - bits<11> alu_inst; - - let Inst{38-37} = omod; - let Inst{49-39} = alu_inst; -} -*/ - -//===----------------------------------------------------------------------===// -// Vertex Fetch instructions -//===----------------------------------------------------------------------===// - -class VTX_WORD0 { - field bits<32> Word0; - bits<7> src_gpr; - bits<5> VC_INST; - bits<2> FETCH_TYPE; - bits<1> FETCH_WHOLE_QUAD; - bits<8> BUFFER_ID; - bits<1> SRC_REL; - bits<2> SRC_SEL_X; - - let Word0{4-0} = VC_INST; - let Word0{6-5} = FETCH_TYPE; - let Word0{7} = FETCH_WHOLE_QUAD; - let Word0{15-8} = BUFFER_ID; - let Word0{22-16} = src_gpr; - let Word0{23} = SRC_REL; - let Word0{25-24} = SRC_SEL_X; -} - -class VTX_WORD0_eg : VTX_WORD0 { - - bits<6> MEGA_FETCH_COUNT; - - let Word0{31-26} = MEGA_FETCH_COUNT; -} - -class VTX_WORD0_cm : VTX_WORD0 { - - bits<2> SRC_SEL_Y; - bits<2> STRUCTURED_READ; - bits<1> LDS_REQ; - bits<1> COALESCED_READ; - - let Word0{27-26} = SRC_SEL_Y; - let Word0{29-28} = STRUCTURED_READ; - let Word0{30} = LDS_REQ; - let Word0{31} = COALESCED_READ; -} - -class VTX_WORD1_GPR { - field bits<32> Word1; - bits<7> dst_gpr; - bits<1> DST_REL; - bits<3> DST_SEL_X; - bits<3> DST_SEL_Y; - bits<3> DST_SEL_Z; - bits<3> DST_SEL_W; - bits<1> USE_CONST_FIELDS; - bits<6> DATA_FORMAT; - bits<2> NUM_FORMAT_ALL; - bits<1> FORMAT_COMP_ALL; - bits<1> SRF_MODE_ALL; - - let Word1{6-0} = dst_gpr; - let Word1{7} = DST_REL; - let Word1{8} = 0; // Reserved - let Word1{11-9} = DST_SEL_X; - let Word1{14-12} = DST_SEL_Y; - let Word1{17-15} = DST_SEL_Z; - let Word1{20-18} = DST_SEL_W; - let Word1{21} = USE_CONST_FIELDS; - let Word1{27-22} = DATA_FORMAT; - let Word1{29-28} = NUM_FORMAT_ALL; - let Word1{30} = FORMAT_COMP_ALL; - let Word1{31} = SRF_MODE_ALL; -} - -//===----------------------------------------------------------------------===// -// Texture fetch instructions -//===----------------------------------------------------------------------===// - -class TEX_WORD0 { - field bits<32> Word0; - - bits<5> TEX_INST; - bits<2> INST_MOD; - bits<1> FETCH_WHOLE_QUAD; - bits<8> RESOURCE_ID; - bits<7> SRC_GPR; - bits<1> SRC_REL; - bits<1> ALT_CONST; - bits<2> RESOURCE_INDEX_MODE; - bits<2> SAMPLER_INDEX_MODE; - - let Word0{4-0} = TEX_INST; - let Word0{6-5} = INST_MOD; - let Word0{7} = FETCH_WHOLE_QUAD; - let Word0{15-8} = RESOURCE_ID; - let Word0{22-16} = SRC_GPR; - let Word0{23} = SRC_REL; - let Word0{24} = ALT_CONST; - let Word0{26-25} = RESOURCE_INDEX_MODE; - let Word0{28-27} = SAMPLER_INDEX_MODE; -} - -class TEX_WORD1 { - field bits<32> Word1; - - bits<7> DST_GPR; - bits<1> DST_REL; - bits<3> DST_SEL_X; - bits<3> DST_SEL_Y; - bits<3> DST_SEL_Z; - bits<3> DST_SEL_W; - bits<7> LOD_BIAS; - bits<1> COORD_TYPE_X; - bits<1> COORD_TYPE_Y; - bits<1> COORD_TYPE_Z; - bits<1> COORD_TYPE_W; - - let Word1{6-0} = DST_GPR; - let Word1{7} = DST_REL; - let Word1{11-9} = DST_SEL_X; - let Word1{14-12} = DST_SEL_Y; - let Word1{17-15} = DST_SEL_Z; - let Word1{20-18} = DST_SEL_W; - let Word1{27-21} = LOD_BIAS; - let Word1{28} = COORD_TYPE_X; - let Word1{29} = COORD_TYPE_Y; - let Word1{30} = COORD_TYPE_Z; - let Word1{31} = COORD_TYPE_W; -} - -class TEX_WORD2 { - field bits<32> Word2; - - bits<5> OFFSET_X; - bits<5> OFFSET_Y; - bits<5> OFFSET_Z; - bits<5> SAMPLER_ID; - bits<3> SRC_SEL_X; - bits<3> 
SRC_SEL_Y; - bits<3> SRC_SEL_Z; - bits<3> SRC_SEL_W; - - let Word2{4-0} = OFFSET_X; - let Word2{9-5} = OFFSET_Y; - let Word2{14-10} = OFFSET_Z; - let Word2{19-15} = SAMPLER_ID; - let Word2{22-20} = SRC_SEL_X; - let Word2{25-23} = SRC_SEL_Y; - let Word2{28-26} = SRC_SEL_Z; - let Word2{31-29} = SRC_SEL_W; -} - -//===----------------------------------------------------------------------===// -// Control Flow Instructions -//===----------------------------------------------------------------------===// - -class CF_WORD1_R600 { - field bits<32> Word1; - - bits<3> POP_COUNT; - bits<5> CF_CONST; - bits<2> COND; - bits<3> COUNT; - bits<6> CALL_COUNT; - bits<1> COUNT_3; - bits<1> END_OF_PROGRAM; - bits<1> VALID_PIXEL_MODE; - bits<7> CF_INST; - bits<1> WHOLE_QUAD_MODE; - bits<1> BARRIER; - - let Word1{2-0} = POP_COUNT; - let Word1{7-3} = CF_CONST; - let Word1{9-8} = COND; - let Word1{12-10} = COUNT; - let Word1{18-13} = CALL_COUNT; - let Word1{19} = COUNT_3; - let Word1{21} = END_OF_PROGRAM; - let Word1{22} = VALID_PIXEL_MODE; - let Word1{29-23} = CF_INST; - let Word1{30} = WHOLE_QUAD_MODE; - let Word1{31} = BARRIER; -} - -class CF_WORD0_EG { - field bits<32> Word0; - - bits<24> ADDR; - bits<3> JUMPTABLE_SEL; - - let Word0{23-0} = ADDR; - let Word0{26-24} = JUMPTABLE_SEL; -} - -class CF_WORD1_EG { - field bits<32> Word1; - - bits<3> POP_COUNT; - bits<5> CF_CONST; - bits<2> COND; - bits<6> COUNT; - bits<1> VALID_PIXEL_MODE; - bits<1> END_OF_PROGRAM; - bits<8> CF_INST; - bits<1> BARRIER; - - let Word1{2-0} = POP_COUNT; - let Word1{7-3} = CF_CONST; - let Word1{9-8} = COND; - let Word1{15-10} = COUNT; - let Word1{20} = VALID_PIXEL_MODE; - let Word1{21} = END_OF_PROGRAM; - let Word1{29-22} = CF_INST; - let Word1{31} = BARRIER; -} - -class CF_ALU_WORD0 { - field bits<32> Word0; - - bits<22> ADDR; - bits<4> KCACHE_BANK0; - bits<4> KCACHE_BANK1; - bits<2> KCACHE_MODE0; - - let Word0{21-0} = ADDR; - let Word0{25-22} = KCACHE_BANK0; - let Word0{29-26} = KCACHE_BANK1; - let Word0{31-30} = KCACHE_MODE0; -} - -class CF_ALU_WORD1 { - field bits<32> Word1; - - bits<2> KCACHE_MODE1; - bits<8> KCACHE_ADDR0; - bits<8> KCACHE_ADDR1; - bits<7> COUNT; - bits<1> ALT_CONST; - bits<4> CF_INST; - bits<1> WHOLE_QUAD_MODE; - bits<1> BARRIER; - - let Word1{1-0} = KCACHE_MODE1; - let Word1{9-2} = KCACHE_ADDR0; - let Word1{17-10} = KCACHE_ADDR1; - let Word1{24-18} = COUNT; - let Word1{25} = ALT_CONST; - let Word1{29-26} = CF_INST; - let Word1{30} = WHOLE_QUAD_MODE; - let Word1{31} = BARRIER; -} - -class CF_ALLOC_EXPORT_WORD0_RAT { - field bits<32> Word0; - - bits<4> rat_id; - bits<6> rat_inst; - bits<2> rim; - bits<2> type; - bits<7> rw_gpr; - bits<1> rw_rel; - bits<7> index_gpr; - bits<2> elem_size; - - let Word0{3-0} = rat_id; - let Word0{9-4} = rat_inst; - let Word0{10} = 0; // Reserved - let Word0{12-11} = rim; - let Word0{14-13} = type; - let Word0{21-15} = rw_gpr; - let Word0{22} = rw_rel; - let Word0{29-23} = index_gpr; - let Word0{31-30} = elem_size; -} - -class CF_ALLOC_EXPORT_WORD1_BUF { - field bits<32> Word1; - - bits<12> array_size; - bits<4> comp_mask; - bits<4> burst_count; - bits<1> vpm; - bits<1> eop; - bits<8> cf_inst; - bits<1> mark; - bits<1> barrier; - - let Word1{11-0} = array_size; - let Word1{15-12} = comp_mask; - let Word1{19-16} = burst_count; - let Word1{20} = vpm; - let Word1{21} = eop; - let Word1{29-22} = cf_inst; - let Word1{30} = mark; - let Word1{31} = barrier; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.cpp 
=================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.cpp (nonexistent) @@ -1,82 +0,0 @@ -//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// \file -// This file implements a TargetTransformInfo analysis pass specific to the -// AMDGPU target machine. It uses the target's detailed information to provide -// more precise answers to certain TTI queries, while letting the target -// independent and default TTI implementations handle the rest. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUTargetTransformInfo.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/CostTable.h" -#include "llvm/Target/TargetLowering.h" -using namespace llvm; - -#define DEBUG_TYPE "AMDGPUtti" - -void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, - TTI::UnrollingPreferences &UP) { - UP.Threshold = 300; // Twice the default. - UP.MaxCount = UINT_MAX; - UP.Partial = true; - - // TODO: Do we want runtime unrolling? - - for (const BasicBlock *BB : L->getBlocks()) { - const DataLayout &DL = BB->getModule()->getDataLayout(); - for (const Instruction &I : *BB) { - const GetElementPtrInst *GEP = dyn_cast(&I); - if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) - continue; - - const Value *Ptr = GEP->getPointerOperand(); - const AllocaInst *Alloca = - dyn_cast(GetUnderlyingObject(Ptr, DL)); - if (Alloca) { - // We want to do whatever we can to limit the number of alloca - // instructions that make it through to the code generator. allocas - // require us to use indirect addressing, which is slow and prone to - // compiler bugs. If this loop does an address calculation on an - // alloca ptr, then we want to use a higher than normal loop unroll - // threshold. This will give SROA a better chance to eliminate these - // allocas. - // - // Don't use the maximum allowed value here as it will make some - // programs way too big. - UP.Threshold = 800; - } - } - } -} - -unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { - if (Vec) - return 0; - - // Number of VGPRs on SI. - if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 256; - - return 4 * 128; // XXX - 4 channels. Should these count as vector instead? -} - -unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } - -unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { - // Semi-arbitrary large amount. 
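- // (The interleave factor bounds how many independent loop iterations the - // vectorizer may overlap; returning a large cap effectively leaves the - // decision to its other heuristics.)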
- return 64; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp (nonexistent) @@ -1,67 +0,0 @@ -//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass marks all internal functions as always_inline and creates -/// duplicates of all other functions and marks the duplicates as always_inline. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/Module.h" -#include "llvm/Transforms/Utils/Cloning.h" - -using namespace llvm; - -namespace { - -class AMDGPUAlwaysInline : public ModulePass { - - static char ID; - -public: - AMDGPUAlwaysInline() : ModulePass(ID) { } - bool runOnModule(Module &M) override; - const char *getPassName() const override { return "AMDGPU Always Inline Pass"; } -}; - -} // End anonymous namespace - -char AMDGPUAlwaysInline::ID = 0; - -bool AMDGPUAlwaysInline::runOnModule(Module &M) { - - std::vector<Function *> FuncsToClone; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - Function &F = *I; - if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && - !F.hasFnAttribute(Attribute::NoInline)) - FuncsToClone.push_back(&F); - } - - for (Function *F : FuncsToClone) { - ValueToValueMapTy VMap; - Function *NewFunc = CloneFunction(F, VMap, false); - NewFunc->setLinkage(GlobalValue::InternalLinkage); - F->getParent()->getFunctionList().push_back(NewFunc); - F->replaceAllUsesWith(NewFunc); - } - - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - Function &F = *I; - if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) { - F.addFnAttr(Attribute::AlwaysInline); - } - } - return false; -} - -ModulePass *llvm::createAMDGPUAlwaysInlinePass() { - return new AMDGPUAlwaysInline(); -} Property changes on: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp (nonexistent) @@ -1,1436 +0,0 @@ -//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 Implementation of TargetInstrInfo.
-// -//===----------------------------------------------------------------------===// - -#include "R600InstrInfo.h" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "AMDGPUTargetMachine.h" -#include "R600Defines.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -#define GET_INSTRINFO_CTOR_DTOR -#include "AMDGPUGenDFAPacketizer.inc" - -R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), RI() {} - -const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { - return RI; -} - -bool R600InstrInfo::isTrig(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; -} - -bool R600InstrInfo::isVector(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; -} - -void -R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - unsigned VectorComponents = 0; - if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { - VectorComponents = 4; - } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { - VectorComponents = 2; - } - - if (VectorComponents > 0) { - for (unsigned I = 0; I < VectorComponents; I++) { - unsigned SubRegIndex = RI.getSubRegFromChannel(I); - buildDefaultInstruction(MBB, MI, AMDGPU::MOV, - RI.getSubReg(DestReg, SubRegIndex), - RI.getSubReg(SrcReg, SubRegIndex)) - .addReg(DestReg, - RegState::Define | RegState::Implicit); - } - } else { - MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, - DestReg, SrcReg); - NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0)) - .setIsKill(KillSrc); - } -} - -/// \returns true if \p MBBI can be moved into a new basic block. -bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const { - for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(), - E = MBBI->operands_end(); I != E; ++I) { - if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) && - I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg())) - return false; - } - return true; -} - -bool R600InstrInfo::isMov(unsigned Opcode) const { - - - switch(Opcode) { - default: return false; - case AMDGPU::MOV: - case AMDGPU::MOV_IMM_F32: - case AMDGPU::MOV_IMM_I32: - return true; - } -} - -// Some instructions act as placeholders to emulate operations that the GPU -// hardware does automatically. This function can be used to check if -// an opcode falls into this category.
-bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { - switch (Opcode) { - default: return false; - case AMDGPU::RETURN: - return true; - } -} - -bool R600InstrInfo::isReductionOp(unsigned Opcode) const { - return false; -} - -bool R600InstrInfo::isCubeOp(unsigned Opcode) const { - switch(Opcode) { - default: return false; - case AMDGPU::CUBE_r600_pseudo: - case AMDGPU::CUBE_r600_real: - case AMDGPU::CUBE_eg_pseudo: - case AMDGPU::CUBE_eg_real: - return true; - } -} - -bool R600InstrInfo::isALUInstr(unsigned Opcode) const { - unsigned TargetFlags = get(Opcode).TSFlags; - - return (TargetFlags & R600_InstFlag::ALU_INST); -} - -bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const { - unsigned TargetFlags = get(Opcode).TSFlags; - - return ((TargetFlags & R600_InstFlag::OP1) | - (TargetFlags & R600_InstFlag::OP2) | - (TargetFlags & R600_InstFlag::OP3)); -} - -bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { - unsigned TargetFlags = get(Opcode).TSFlags; - - return ((TargetFlags & R600_InstFlag::LDS_1A) | - (TargetFlags & R600_InstFlag::LDS_1A1D) | - (TargetFlags & R600_InstFlag::LDS_1A2D)); -} - -bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const { - return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1; -} - -bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { - return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; -} - -bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { - if (isALUInstr(MI->getOpcode())) - return true; - if (isVector(*MI) || isCubeOp(MI->getOpcode())) - return true; - switch (MI->getOpcode()) { - case AMDGPU::PRED_X: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::COPY: - case AMDGPU::DOT_4: - return true; - default: - return false; - } -} - -bool R600InstrInfo::isTransOnly(unsigned Opcode) const { - if (ST.hasCaymanISA()) - return false; - return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); -} - -bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const { - return isTransOnly(MI->getOpcode()); -} - -bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { - return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); -} - -bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const { - return isVectorOnly(MI->getOpcode()); -} - -bool R600InstrInfo::isExport(unsigned Opcode) const { - return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT); -} - -bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { - return ST.hasVertexCache() && IS_VTX(get(Opcode)); -} - -bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); - const R600MachineFunctionInfo *MFI = MF->getInfo(); - return MFI->getShaderType() != ShaderType::COMPUTE && - usesVertexCache(MI->getOpcode()); -} - -bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { - return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode)); -} - -bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); - const R600MachineFunctionInfo *MFI = MF->getInfo(); - return (MFI->getShaderType() == ShaderType::COMPUTE && - usesVertexCache(MI->getOpcode())) || - usesTextureCache(MI->getOpcode()); -} - -bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { - switch (Opcode) { - case AMDGPU::KILLGT: - case AMDGPU::GROUP_BARRIER: - return true; - default: - return false; - } -} - 
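(A minimal illustrative sketch, not part of the original file: how a clause builder might consult the predicates above before appending an instruction to a vertex-fetch clause. The helper name and the CloseAfter flag are assumptions made for this example.)

static bool mayJoinVertexClause(const R600InstrInfo &TII,
                                const MachineInstr *MI, bool &CloseAfter) {
  // Only vertex-cache users belong in a vertex-fetch clause; a texture-cache
  // user would have to open a TEX clause instead.
  if (!TII.usesVertexCache(MI))
    return false;
  // Opcodes such as KILLGT or GROUP_BARRIER may join, but the clause must be
  // closed immediately after them.
  CloseAfter = TII.mustBeLastInClause(MI->getOpcode());
  return true;
}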
-bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const { - return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; -} - -bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const { - return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; -} - -bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { - if (!isALUInstr(MI->getOpcode())) { - return false; - } - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { - if (!I->isReg() || !I->isUse() || - TargetRegisterInfo::isVirtualRegister(I->getReg())) - continue; - - if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg())) - return true; - } - return false; -} - -int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { - static const unsigned OpTable[] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 - }; - - assert (SrcNum < 3); - return getOperandIdx(Opcode, OpTable[SrcNum]); -} - -int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { - static const unsigned SrcSelTable[][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} - }; - - for (const auto &Row : SrcSelTable) { - if (getOperandIdx(Opcode, Row[0]) == (int)SrcIdx) { - return getOperandIdx(Opcode, Row[1]); - } - } - return -1; -} - -SmallVector, 3> -R600InstrInfo::getSrcs(MachineInstr *MI) const { - SmallVector, 3> Result; - - if (MI->getOpcode() == AMDGPU::DOT_4) { - static const unsigned OpTable[8][2] = { - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}, - }; - - for (unsigned j = 0; j < 8; j++) { - MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - OpTable[j][0])); - unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::ALU_CONST) { - unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(), - OpTable[j][1])).getImm(); - Result.push_back(std::pair(&MO, Sel)); - continue; - } - - } - return Result; - } - - static const unsigned OpTable[3][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, - }; - - for (unsigned j = 0; j < 3; j++) { - int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); - if (SrcIdx < 0) - break; - MachineOperand &MO = MI->getOperand(SrcIdx); - unsigned Reg = MI->getOperand(SrcIdx).getReg(); - if (Reg == AMDGPU::ALU_CONST) { - unsigned Sel = MI->getOperand( - getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); - Result.push_back(std::pair(&MO, Sel)); - continue; - } - if (Reg == AMDGPU::ALU_LITERAL_X) { - unsigned Imm = MI->getOperand( - 
getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm(); - Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Imm)); - continue; - } - Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, 0)); - } - return Result; -} - -std::vector<std::pair<int, unsigned> > -R600InstrInfo::ExtractSrcs(MachineInstr *MI, - const DenseMap<unsigned, unsigned> &PV, - unsigned &ConstCount) const { - ConstCount = 0; - const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs = getSrcs(MI); - const std::pair<int, unsigned> DummyPair(-1, 0); - std::vector<std::pair<int, unsigned> > Result; - unsigned i = 0; - for (unsigned n = Srcs.size(); i < n; ++i) { - unsigned Reg = Srcs[i].first->getReg(); - unsigned Index = RI.getEncodingValue(Reg) & 0xff; - if (Reg == AMDGPU::OQAP) { - Result.push_back(std::pair<int, unsigned>(Index, 0)); - } - if (PV.find(Reg) != PV.end()) { - // 255 is used to tell it's a PS/PV reg - Result.push_back(std::pair<int, unsigned>(255, 0)); - continue; - } - if (Index > 127) { - ConstCount++; - Result.push_back(DummyPair); - continue; - } - unsigned Chan = RI.getHWRegChan(Reg); - Result.push_back(std::pair<int, unsigned>(Index, Chan)); - } - for (; i < 3; ++i) - Result.push_back(DummyPair); - return Result; -} - -static std::vector<std::pair<int, unsigned> > -Swizzle(std::vector<std::pair<int, unsigned> > Src, - R600InstrInfo::BankSwizzle Swz) { - if (Src[0] == Src[1]) - Src[1].first = -1; - switch (Swz) { - case R600InstrInfo::ALU_VEC_012_SCL_210: - break; - case R600InstrInfo::ALU_VEC_021_SCL_122: - std::swap(Src[1], Src[2]); - break; - case R600InstrInfo::ALU_VEC_102_SCL_221: - std::swap(Src[0], Src[1]); - break; - case R600InstrInfo::ALU_VEC_120_SCL_212: - std::swap(Src[0], Src[1]); - std::swap(Src[0], Src[2]); - break; - case R600InstrInfo::ALU_VEC_201: - std::swap(Src[0], Src[2]); - std::swap(Src[0], Src[1]); - break; - case R600InstrInfo::ALU_VEC_210: - std::swap(Src[0], Src[2]); - break; - } - return Src; -} - -static unsigned -getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { - switch (Swz) { - case R600InstrInfo::ALU_VEC_012_SCL_210: { - unsigned Cycles[3] = { 2, 1, 0}; - return Cycles[Op]; - } - case R600InstrInfo::ALU_VEC_021_SCL_122: { - unsigned Cycles[3] = { 1, 2, 2}; - return Cycles[Op]; - } - case R600InstrInfo::ALU_VEC_120_SCL_212: { - unsigned Cycles[3] = { 2, 1, 2}; - return Cycles[Op]; - } - case R600InstrInfo::ALU_VEC_102_SCL_221: { - unsigned Cycles[3] = { 2, 2, 1}; - return Cycles[Op]; - } - default: - llvm_unreachable("Wrong Swizzle for Trans Slot"); - return 0; - } -} - -/// Returns how many MIs (whose inputs are represented by IGSrcs) can be packed -/// in the same Instruction Group while meeting read port limitations given a -/// Swz swizzle sequence. -unsigned R600InstrInfo::isLegalUpTo( - const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, - const std::vector<R600InstrInfo::BankSwizzle> &Swz, - const std::vector<std::pair<int, unsigned> > &TransSrcs, - R600InstrInfo::BankSwizzle TransSwz) const { - int Vector[4][3]; - memset(Vector, -1, sizeof(Vector)); - for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) { - const std::vector<std::pair<int, unsigned> > &Srcs = - Swizzle(IGSrcs[i], Swz[i]); - for (unsigned j = 0; j < 3; j++) { - const std::pair<int, unsigned> &Src = Srcs[j]; - if (Src.first < 0 || Src.first == 255) - continue; - if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) { - if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 && - Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) { - // The value from output queue A (denoted by register OQAP) can - // only be fetched during the first cycle.
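- // Only the two swizzles above keep this operand in the first read slot; - // any other swizzle would read OQAP in a later cycle, after the queued - // value is no longer available.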
-            return false;
-        }
-        // OQAP does not count towards the normal read port restrictions
-        continue;
-      }
-      if (Vector[Src.second][j] < 0)
-        Vector[Src.second][j] = Src.first;
-      if (Vector[Src.second][j] != Src.first)
-        return i;
-    }
-  }
-  // Now check Trans Alu
-  for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) {
-    const std::pair<int, unsigned> &Src = TransSrcs[i];
-    unsigned Cycle = getTransSwizzle(TransSwz, i);
-    if (Src.first < 0)
-      continue;
-    if (Src.first == 255)
-      continue;
-    if (Vector[Src.second][Cycle] < 0)
-      Vector[Src.second][Cycle] = Src.first;
-    if (Vector[Src.second][Cycle] != Src.first)
-      return IGSrcs.size() - 1;
-  }
-  return IGSrcs.size();
-}
-
-/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next
-/// (in lexicographic terms) swizzle sequence assuming that all swizzles after
-/// Idx can be skipped
-static bool
-NextPossibleSolution(
-    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
-    unsigned Idx) {
-  assert(Idx < SwzCandidate.size());
-  int ResetIdx = Idx;
-  while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210)
-    ResetIdx --;
-  for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) {
-    SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210;
-  }
-  if (ResetIdx == -1)
-    return false;
-  int NextSwizzle = SwzCandidate[ResetIdx] + 1;
-  SwzCandidate[ResetIdx] = (R600InstrInfo::BankSwizzle)NextSwizzle;
-  return true;
-}
-
-/// Enumerate all possible Swizzle sequences to find one that can meet all
-/// read port requirements.
-bool R600InstrInfo::FindSwizzleForVectorSlot(
-    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
-    const std::vector<std::pair<int, unsigned> > &TransSrcs,
-    R600InstrInfo::BankSwizzle TransSwz) const {
-  unsigned ValidUpTo = 0;
-  do {
-    ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz);
-    if (ValidUpTo == IGSrcs.size())
-      return true;
-  } while (NextPossibleSolution(SwzCandidate, ValidUpTo));
-  return false;
-}
-
-/// Instructions in Trans slot can't read gpr at cycle 0 if they also read
-/// a const, and can't read a gpr at cycle 1 if they read 2 const.
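-/// For example (illustrative): with TransSwz = ALU_VEC_021_SCL_122 the three
-/// trans operands are read in cycles 1, 2 and 2, so a single constant read
-/// keeps them all legal, but a second constant makes a gpr read scheduled
-/// in cycle 1 illegal.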
-static bool
-isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
-                  const std::vector<std::pair<int, unsigned> > &TransOps,
-                  unsigned ConstCount) {
-  // TransALU can't read 3 constants
-  if (ConstCount > 2)
-    return false;
-  for (unsigned i = 0, e = TransOps.size(); i < e; ++i) {
-    const std::pair<int, unsigned> &Src = TransOps[i];
-    unsigned Cycle = getTransSwizzle(TransSwz, i);
-    if (Src.first < 0)
-      continue;
-    if (ConstCount > 0 && Cycle == 0)
-      return false;
-    if (ConstCount > 1 && Cycle == 1)
-      return false;
-  }
-  return true;
-}
-
-bool
-R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
-                                       const DenseMap<unsigned, unsigned> &PV,
-                                       std::vector<BankSwizzle> &ValidSwizzle,
-                                       bool isLastAluTrans)
-    const {
-  // TODO: support shared src0 - src1 operand
-
-  std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs;
-  ValidSwizzle.clear();
-  unsigned ConstCount;
-  BankSwizzle TransBS = ALU_VEC_012_SCL_210;
-  for (unsigned i = 0, e = IG.size(); i < e; ++i) {
-    IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount));
-    unsigned Op = getOperandIdx(IG[i]->getOpcode(),
-                                AMDGPU::OpName::bank_swizzle);
-    ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
-                            IG[i]->getOperand(Op).getImm());
-  }
-  std::vector<std::pair<int, unsigned> > TransOps;
-  if (!isLastAluTrans)
-    return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
-
-  TransOps = std::move(IGSrcs.back());
-  IGSrcs.pop_back();
-  ValidSwizzle.pop_back();
-
-  static const R600InstrInfo::BankSwizzle TransSwz[] = {
-    ALU_VEC_012_SCL_210,
-    ALU_VEC_021_SCL_122,
-    ALU_VEC_120_SCL_212,
-    ALU_VEC_102_SCL_221
-  };
-  for (unsigned i = 0; i < 4; i++) {
-    TransBS = TransSwz[i];
-    if (!isConstCompatible(TransBS, TransOps, ConstCount))
-      continue;
-    bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps,
-                                           TransBS);
-    if (Result) {
-      ValidSwizzle.push_back(TransBS);
-      return true;
-    }
-  }
-
-  return false;
-}
-
-
-bool
-R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
-    const {
-  assert (Consts.size() <= 12 && "Too many operands in instructions group");
-  unsigned Pair1 = 0, Pair2 = 0;
-  for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
-    unsigned ReadConstHalf = Consts[i] & 2;
-    unsigned ReadConstIndex = Consts[i] & (~3);
-    unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf;
-    if (!Pair1) {
-      Pair1 = ReadHalfConst;
-      continue;
-    }
-    if (Pair1 == ReadHalfConst)
-      continue;
-    if (!Pair2) {
-      Pair2 = ReadHalfConst;
-      continue;
-    }
-    if (Pair2 != ReadHalfConst)
-      return false;
-  }
-  return true;
-}
-
-bool
-R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
-    const {
-  std::vector<unsigned> Consts;
-  SmallSet<int64_t, 4> Literals;
-  for (unsigned i = 0, n = MIs.size(); i < n; i++) {
-    MachineInstr *MI = MIs[i];
-    if (!isALUInstr(MI->getOpcode()))
-      continue;
-
-    const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Srcs =
-        getSrcs(MI);
-
-    for (unsigned j = 0, e = Srcs.size(); j < e; j++) {
-      std::pair<MachineOperand *, int64_t> Src = Srcs[j];
-      if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X)
-        Literals.insert(Src.second);
-      if (Literals.size() > 4)
-        return false;
-      if (Src.first->getReg() == AMDGPU::ALU_CONST)
-        Consts.push_back(Src.second);
-      if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) ||
-          AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) {
-        unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff;
-        unsigned Chan = RI.getHWRegChan(Src.first->getReg());
-        Consts.push_back((Index << 2) | Chan);
-      }
-    }
-  }
-  return fitsConstReadLimitations(Consts);
-}
-
-DFAPacketizer *
-R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const {
-  const InstrItineraryData *II = STI.getInstrItineraryData();
-  return static_cast<const AMDGPUSubtarget &>(STI).createDFAPacketizer(II);
-}
-
-static bool
-isPredicateSetter(unsigned Opcode) {
-  switch (Opcode) {
-  case AMDGPU::PRED_X:
-    return true;
-  default:
-    return false;
-  }
-}
-
-static MachineInstr *
-findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
-                             MachineBasicBlock::iterator I) {
-  while (I != MBB.begin()) {
-    --I;
-    MachineInstr *MI = I;
-    if (isPredicateSetter(MI->getOpcode()))
-      return MI;
-  }
-
-  return nullptr;
-}
-
-static
-bool isJump(unsigned Opcode) {
-  return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
-}
-
-static bool isBranch(unsigned Opcode) {
-  return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 ||
-      Opcode == AMDGPU::BRANCH_COND_f32;
-}
-
-bool
-R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
-                             MachineBasicBlock *&TBB,
-                             MachineBasicBlock *&FBB,
-                             SmallVectorImpl<MachineOperand> &Cond,
-                             bool AllowModify) const {
-  // Most of the following comes from the ARM implementation of AnalyzeBranch
-
-  // If the block has no terminators, it just falls into the block after it.
-  MachineBasicBlock::iterator I = MBB.end();
-  if (I == MBB.begin())
-    return false;
-  --I;
-  while (I->isDebugValue()) {
-    if (I == MBB.begin())
-      return false;
-    --I;
-  }
-  // AMDGPU::BRANCH* instructions are only available after isel and are not
-  // handled
-  if (isBranch(I->getOpcode()))
-    return true;
-  if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) {
-    return false;
-  }
-
-  // Remove successive JUMP
-  while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) {
-    MachineBasicBlock::iterator PriorI = std::prev(I);
-    if (AllowModify)
-      I->removeFromParent();
-    I = PriorI;
-  }
-  MachineInstr *LastInst = I;
-
-  // If there is only one terminator instruction, process it.
-  unsigned LastOpc = LastInst->getOpcode();
-  if (I == MBB.begin() ||
-      !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) {
-    if (LastOpc == AMDGPU::JUMP) {
-      TBB = LastInst->getOperand(0).getMBB();
-      return false;
-    } else if (LastOpc == AMDGPU::JUMP_COND) {
-      MachineInstr *predSet = I;
-      while (!isPredicateSetter(predSet->getOpcode())) {
-        predSet = --I;
-      }
-      TBB = LastInst->getOperand(0).getMBB();
-      Cond.push_back(predSet->getOperand(1));
-      Cond.push_back(predSet->getOperand(2));
-      Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
-      return false;
-    }
-    return true;  // Can't handle indirect branch.
-  }
-
-  // Get the instruction before it if it is a terminator.
-  MachineInstr *SecondLastInst = I;
-  unsigned SecondLastOpc = SecondLastInst->getOpcode();
-
-  // If the block ends with a B and a Bcc, handle it.
-  if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
-    MachineInstr *predSet = --I;
-    while (!isPredicateSetter(predSet->getOpcode())) {
-      predSet = --I;
-    }
-    TBB = SecondLastInst->getOperand(0).getMBB();
-    FBB = LastInst->getOperand(0).getMBB();
-    Cond.push_back(predSet->getOperand(1));
-    Cond.push_back(predSet->getOperand(2));
-    Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
-    return false;
-  }
-
-  // Otherwise, can't handle this.
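-  // (Per the AnalyzeBranch contract, returning true signals that the
-  // terminators could not be analyzed; TBB, FBB and Cond are only
-  // meaningful when the function returns false.)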
-  return true;
-}
-
-static
-MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
-  for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend();
-      It != E; ++It) {
-    if (It->getOpcode() == AMDGPU::CF_ALU ||
-        It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
-      return std::prev(It.base());
-  }
-  return MBB.end();
-}
-
-unsigned
-R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
-                            MachineBasicBlock *TBB,
-                            MachineBasicBlock *FBB,
-                            const SmallVectorImpl<MachineOperand> &Cond,
-                            DebugLoc DL) const {
-  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
-
-  if (!FBB) {
-    if (Cond.empty()) {
-      BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
-      return 1;
-    } else {
-      MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
-      assert(PredSet && "No previous predicate !");
-      addFlag(PredSet, 0, MO_FLAG_PUSH);
-      PredSet->getOperand(2).setImm(Cond[1].getImm());
-
-      BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
-             .addMBB(TBB)
-             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
-      MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
-      if (CfAlu == MBB.end())
-        return 1;
-      assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
-      CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
-      return 1;
-    }
-  } else {
-    MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
-    assert(PredSet && "No previous predicate !");
-    addFlag(PredSet, 0, MO_FLAG_PUSH);
-    PredSet->getOperand(2).setImm(Cond[1].getImm());
-    BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
-            .addMBB(TBB)
-            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
-    BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
-    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
-    if (CfAlu == MBB.end())
-      return 2;
-    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
-    CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
-    return 2;
-  }
-}
-
-unsigned
-R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
-
-  // Note: we leave PRED* instructions there.
-  // They may be needed when predicating instructions.
-
-  MachineBasicBlock::iterator I = MBB.end();
-
-  if (I == MBB.begin()) {
-    return 0;
-  }
-  --I;
-  switch (I->getOpcode()) {
-  default:
-    return 0;
-  case AMDGPU::JUMP_COND: {
-    MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
-    clearFlag(predSet, 0, MO_FLAG_PUSH);
-    I->eraseFromParent();
-    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
-    if (CfAlu == MBB.end())
-      break;
-    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
-    CfAlu->setDesc(get(AMDGPU::CF_ALU));
-    break;
-  }
-  case AMDGPU::JUMP:
-    I->eraseFromParent();
-    break;
-  }
-  I = MBB.end();
-
-  if (I == MBB.begin()) {
-    return 1;
-  }
-  --I;
-  switch (I->getOpcode()) {
-    // FIXME: only one case??
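-    // (In the JUMP_COND + JUMP pattern recognized by AnalyzeBranch, the
-    // terminator seen here can only be the JUMP_COND; the JUMP case below
-    // looks unreachable, hence the FIXME above.)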
-  default:
-    return 1;
-  case AMDGPU::JUMP_COND: {
-    MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
-    clearFlag(predSet, 0, MO_FLAG_PUSH);
-    I->eraseFromParent();
-    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
-    if (CfAlu == MBB.end())
-      break;
-    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
-    CfAlu->setDesc(get(AMDGPU::CF_ALU));
-    break;
-  }
-  case AMDGPU::JUMP:
-    I->eraseFromParent();
-    break;
-  }
-  return 2;
-}
-
-bool
-R600InstrInfo::isPredicated(const MachineInstr *MI) const {
-  int idx = MI->findFirstPredOperandIdx();
-  if (idx < 0)
-    return false;
-
-  unsigned Reg = MI->getOperand(idx).getReg();
-  switch (Reg) {
-  default: return false;
-  case AMDGPU::PRED_SEL_ONE:
-  case AMDGPU::PRED_SEL_ZERO:
-  case AMDGPU::PREDICATE_BIT:
-    return true;
-  }
-}
-
-bool
-R600InstrInfo::isPredicable(MachineInstr *MI) const {
-  // XXX: KILL* instructions can be predicated, but they must be the last
-  // instruction in a clause, so this means any instructions after them cannot
-  // be predicated.  Until we have proper support for instruction clauses in the
-  // backend, we will mark KILL* instructions as unpredicable.
-
-  if (MI->getOpcode() == AMDGPU::KILLGT) {
-    return false;
-  } else if (MI->getOpcode() == AMDGPU::CF_ALU) {
-    // If the clause starts in the middle of the MBB then the MBB contains
-    // more than a single clause, and we are unable to predicate multiple
-    // clauses.
-    if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI))
-      return false;
-    // TODO: We don't support KC merging atm
-    if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0)
-      return false;
-    return true;
-  } else if (isVector(*MI)) {
-    return false;
-  } else {
-    return AMDGPUInstrInfo::isPredicable(MI);
-  }
-}
-
-
-bool
-R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
-                                   unsigned NumCycles,
-                                   unsigned ExtraPredCycles,
-                                   const BranchProbability &Probability) const {
-  return true;
-}
-
-bool
-R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
-                                   unsigned NumTCycles,
-                                   unsigned ExtraTCycles,
-                                   MachineBasicBlock &FMBB,
-                                   unsigned NumFCycles,
-                                   unsigned ExtraFCycles,
-                                   const BranchProbability &Probability) const {
-  return true;
-}
-
-bool
-R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
-                                         unsigned NumCycles,
-                                         const BranchProbability &Probability)
-    const {
-  return true;
-}
-
-bool
-R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
-                                         MachineBasicBlock &FMBB) const {
-  return false;
-}
-
-
-bool
-R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
-  MachineOperand &MO = Cond[1];
-  switch (MO.getImm()) {
-  case OPCODE_IS_ZERO_INT:
-    MO.setImm(OPCODE_IS_NOT_ZERO_INT);
-    break;
-  case OPCODE_IS_NOT_ZERO_INT:
-    MO.setImm(OPCODE_IS_ZERO_INT);
-    break;
-  case OPCODE_IS_ZERO:
-    MO.setImm(OPCODE_IS_NOT_ZERO);
-    break;
-  case OPCODE_IS_NOT_ZERO:
-    MO.setImm(OPCODE_IS_ZERO);
-    break;
-  default:
-    return true;
-  }
-
-  MachineOperand &MO2 = Cond[2];
-  switch (MO2.getReg()) {
-  case AMDGPU::PRED_SEL_ZERO:
-    MO2.setReg(AMDGPU::PRED_SEL_ONE);
-    break;
-  case AMDGPU::PRED_SEL_ONE:
-    MO2.setReg(AMDGPU::PRED_SEL_ZERO);
-    break;
-  default:
-    return true;
-  }
-  return false;
-}
-
-bool
-R600InstrInfo::DefinesPredicate(MachineInstr *MI,
-                                std::vector<MachineOperand> &Pred) const {
-  return isPredicateSetter(MI->getOpcode());
-}
-
-
-bool
-R600InstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
-                                 const SmallVectorImpl<MachineOperand> &Pred2) const {
-  return false;
-}
-
-
-bool
-R600InstrInfo::PredicateInstruction(MachineInstr *MI,
-                                    const SmallVectorImpl<MachineOperand> &Pred) const {
-  int PIdx = MI->findFirstPredOperandIdx();
-
-  if (MI->getOpcode() == AMDGPU::CF_ALU) {
-    MI->getOperand(8).setImm(0);
-    return true;
-  }
-
-  if (MI->getOpcode() == AMDGPU::DOT_4) {
-    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X))
-        .setReg(Pred[2].getReg());
-    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y))
-        .setReg(Pred[2].getReg());
-    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z))
-        .setReg(Pred[2].getReg());
-    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W))
-        .setReg(Pred[2].getReg());
-    MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
-    MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
-    return true;
-  }
-
-  if (PIdx != -1) {
-    MachineOperand &PMO = MI->getOperand(PIdx);
-    PMO.setReg(Pred[2].getReg());
-    MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
-    MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
-    return true;
-  }
-
-  return false;
-}
-
-unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const {
-  return 2;
-}
-
-unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
-                                            const MachineInstr *MI,
-                                            unsigned *PredCost) const {
-  if (PredCost)
-    *PredCost = 2;
-  return 2;
-}
-
-bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
-
-  switch(MI->getOpcode()) {
-  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
-  case AMDGPU::R600_EXTRACT_ELT_V2:
-  case AMDGPU::R600_EXTRACT_ELT_V4:
-    buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(),
-                      RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address
-                      MI->getOperand(2).getReg(),
-                      RI.getHWRegChan(MI->getOperand(1).getReg()));
-    break;
-  case AMDGPU::R600_INSERT_ELT_V2:
-  case AMDGPU::R600_INSERT_ELT_V4:
-    buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value
-                       RI.getHWRegIndex(MI->getOperand(1).getReg()),   // Address
-                       MI->getOperand(3).getReg(),                     // Offset
-                       RI.getHWRegChan(MI->getOperand(1).getReg()));   // Channel
-    break;
-  }
-  MI->eraseFromParent();
-  return true;
-}
-
-void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
-                                             const MachineFunction &MF) const {
-  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
-      MF.getSubtarget().getFrameLowering());
-
-  unsigned StackWidth = TFL->getStackWidth(MF);
-  int End = getIndirectIndexEnd(MF);
-
-  if (End == -1)
-    return;
-
-  for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
-    unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
-    Reserved.set(SuperReg);
-    for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
-      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
-      Reserved.set(Reg);
-    }
-  }
-}
-
-unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
-                                                 unsigned Channel) const {
-  // XXX: Remove when we support a stack width > 2
-  assert(Channel == 0);
-  return RegIndex;
-}
-
-const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
-  return &AMDGPU::R600_TReg32_XRegClass;
-}
-
-MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
-                                       MachineBasicBlock::iterator I,
-                                       unsigned ValueReg, unsigned Address,
-                                       unsigned OffsetReg) const {
-  return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0);
-}
-
-MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
-                                       MachineBasicBlock::iterator I,
-                                       unsigned ValueReg, unsigned Address,
-                                       unsigned OffsetReg,
-                                       unsigned AddrChan) const {
-  unsigned AddrReg;
-  switch (AddrChan) {
-  default:
llvm_unreachable("Invalid Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; - } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, OffsetReg); - setImmOperand(MOVA, AMDGPU::OpName::write, 0); - - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, - AddrReg, ValueReg) - .addReg(AMDGPU::AR_X, - RegState::Implicit | RegState::Kill); - setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1); - return Mov; -} - -MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const { - return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0); -} - -MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const { - unsigned AddrReg; - switch (AddrChan) { - default: llvm_unreachable("Invalid Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; - } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, - OffsetReg); - setImmOperand(MOVA, AMDGPU::OpName::write, 0); - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, - ValueReg, - AddrReg) - .addReg(AMDGPU::AR_X, - RegState::Implicit | RegState::Kill); - setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1); - - return Mov; -} - -unsigned R600InstrInfo::getMaxAlusPerClause() const { - return 115; -} - -MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned Opcode, - unsigned DstReg, - unsigned Src0Reg, - unsigned Src1Reg) const { - MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode), - DstReg); // $dst - - if (Src1Reg) { - MIB.addImm(0) // $update_exec_mask - .addImm(0); // $update_predicate - } - MIB.addImm(1) // $write - .addImm(0) // $omod - .addImm(0) // $dst_rel - .addImm(0) // $dst_clamp - .addReg(Src0Reg) // $src0 - .addImm(0) // $src0_neg - .addImm(0) // $src0_rel - .addImm(0) // $src0_abs - .addImm(-1); // $src0_sel - - if (Src1Reg) { - MIB.addReg(Src1Reg) // $src1 - .addImm(0) // $src1_neg - .addImm(0) // $src1_rel - .addImm(0) // $src1_abs - .addImm(-1); // $src1_sel - } - - //XXX: The r600g finalizer expects this to be 1, once we've moved the - //scheduling to the backend, we can change the default to 0. 
- MIB.addImm(1) // $last - .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel - .addImm(0) // $literal - .addImm(0); // $bank_swizzle - - return MIB; -} - -#define OPERAND_CASE(Label) \ - case Label: { \ - static const unsigned Ops[] = \ - { \ - Label##_X, \ - Label##_Y, \ - Label##_Z, \ - Label##_W \ - }; \ - return Ops[Slot]; \ - } - -static unsigned getSlotedOps(unsigned Op, unsigned Slot) { - switch (Op) { - OPERAND_CASE(AMDGPU::OpName::update_exec_mask) - OPERAND_CASE(AMDGPU::OpName::update_pred) - OPERAND_CASE(AMDGPU::OpName::write) - OPERAND_CASE(AMDGPU::OpName::omod) - OPERAND_CASE(AMDGPU::OpName::dst_rel) - OPERAND_CASE(AMDGPU::OpName::clamp) - OPERAND_CASE(AMDGPU::OpName::src0) - OPERAND_CASE(AMDGPU::OpName::src0_neg) - OPERAND_CASE(AMDGPU::OpName::src0_rel) - OPERAND_CASE(AMDGPU::OpName::src0_abs) - OPERAND_CASE(AMDGPU::OpName::src0_sel) - OPERAND_CASE(AMDGPU::OpName::src1) - OPERAND_CASE(AMDGPU::OpName::src1_neg) - OPERAND_CASE(AMDGPU::OpName::src1_rel) - OPERAND_CASE(AMDGPU::OpName::src1_abs) - OPERAND_CASE(AMDGPU::OpName::src1_sel) - OPERAND_CASE(AMDGPU::OpName::pred_sel) - default: - llvm_unreachable("Wrong Operand"); - } -} - -#undef OPERAND_CASE - -MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( - MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) - const { - assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); - unsigned Opcode; - if (ST.getGeneration() <= AMDGPUSubtarget::R700) - Opcode = AMDGPU::DOT4_r600; - else - Opcode = AMDGPU::DOT4_eg; - MachineBasicBlock::iterator I = MI; - MachineOperand &Src0 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot))); - MachineOperand &Src1 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot))); - MachineInstr *MIB = buildDefaultInstruction( - MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); - static const unsigned Operands[14] = { - AMDGPU::OpName::update_exec_mask, - AMDGPU::OpName::update_pred, - AMDGPU::OpName::write, - AMDGPU::OpName::omod, - AMDGPU::OpName::dst_rel, - AMDGPU::OpName::clamp, - AMDGPU::OpName::src0_neg, - AMDGPU::OpName::src0_rel, - AMDGPU::OpName::src0_abs, - AMDGPU::OpName::src0_sel, - AMDGPU::OpName::src1_neg, - AMDGPU::OpName::src1_rel, - AMDGPU::OpName::src1_abs, - AMDGPU::OpName::src1_sel, - }; - - MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - getSlotedOps(AMDGPU::OpName::pred_sel, Slot))); - MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel)) - .setReg(MO.getReg()); - - for (unsigned i = 0; i < 14; i++) { - MachineOperand &MO = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); - assert (MO.isImm()); - setImmOperand(MIB, Operands[i], MO.getImm()); - } - MIB->getOperand(20).setImm(0); - return MIB; -} - -MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, - MachineBasicBlock::iterator I, - unsigned DstReg, - uint64_t Imm) const { - MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, - AMDGPU::ALU_LITERAL_X); - setImmOperand(MovImm, AMDGPU::OpName::literal, Imm); - return MovImm; -} - -MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const { - return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg); -} - -int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { - return getOperandIdx(MI.getOpcode(), Op); -} - -int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned 
Op) const {
-  return AMDGPU::getNamedOperandIdx(Opcode, Op);
-}
-
-void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op,
-                                  int64_t Imm) const {
-  int Idx = getOperandIdx(*MI, Op);
-  assert(Idx != -1 && "Operand not supported for this instruction.");
-  assert(MI->getOperand(Idx).isImm());
-  MI->getOperand(Idx).setImm(Imm);
-}
-
-//===----------------------------------------------------------------------===//
-// Instruction flag getters/setters
-//===----------------------------------------------------------------------===//
-
-bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const {
-  return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0;
-}
-
-MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
-                                         unsigned Flag) const {
-  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
-  int FlagIndex = 0;
-  if (Flag != 0) {
-    // If we pass something other than the default value of Flag to this
-    // function, it means we want to set a flag on an instruction
-    // that uses native encoding.
-    assert(HAS_NATIVE_OPERANDS(TargetFlags));
-    bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
-    switch (Flag) {
-    case MO_FLAG_CLAMP:
-      FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp);
-      break;
-    case MO_FLAG_MASK:
-      FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write);
-      break;
-    case MO_FLAG_NOT_LAST:
-    case MO_FLAG_LAST:
-      FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last);
-      break;
-    case MO_FLAG_NEG:
-      switch (SrcIdx) {
-      case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break;
-      case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break;
-      case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break;
-      }
-      break;
-
-    case MO_FLAG_ABS:
-      assert(!IsOP3 && "Cannot set absolute value modifier for OP3 "
-                       "instructions.");
-      (void)IsOP3;
-      switch (SrcIdx) {
-      case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break;
-      case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break;
-      }
-      break;
-
-    default:
-      FlagIndex = -1;
-      break;
-    }
-    assert(FlagIndex != -1 && "Flag not supported for this instruction");
-  } else {
-    FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags);
-    assert(FlagIndex != 0 &&
-           "Instruction flags not supported for this instruction");
-  }
-
-  MachineOperand &FlagOp = MI->getOperand(FlagIndex);
-  assert(FlagOp.isImm());
-  return FlagOp;
-}
-
-void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand,
-                            unsigned Flag) const {
-  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
-  if (Flag == 0) {
-    return;
-  }
-  if (HAS_NATIVE_OPERANDS(TargetFlags)) {
-    MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
-    if (Flag == MO_FLAG_NOT_LAST) {
-      clearFlag(MI, Operand, MO_FLAG_LAST);
-    } else if (Flag == MO_FLAG_MASK) {
-      clearFlag(MI, Operand, Flag);
-    } else {
-      FlagOp.setImm(1);
-    }
-  } else {
-    MachineOperand &FlagOp = getFlagOp(MI, Operand);
-    FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand)));
-  }
-}
-
-void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand,
-                              unsigned Flag) const {
-  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
-  if (HAS_NATIVE_OPERANDS(TargetFlags)) {
-    MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
-    FlagOp.setImm(0);
-  } else {
-    MachineOperand &FlagOp = getFlagOp(MI);
-    unsigned InstFlags = FlagOp.getImm();
-    InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand));
-    FlagOp.setImm(InstFlags);
-  }
-}
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIRegisterInfo.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/SIRegisterInfo.td	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIRegisterInfo.td	(nonexistent)
@@ -1,284 +0,0 @@
-//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Declarations that describe the SI registers
-//===----------------------------------------------------------------------===//
-
-class SIReg <string n, bits<16> encoding = 0> : Register<n> {
-  let Namespace = "AMDGPU";
-  let HWEncoding = encoding;
-}
-
-// Special Registers
-def VCC_LO : SIReg<"vcc_lo", 106>;
-def VCC_HI : SIReg<"vcc_hi", 107>;
-
-// VCC for 64-bit instructions
-def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
-  let Namespace = "AMDGPU";
-  let SubRegIndices = [sub0, sub1];
-  let HWEncoding = 106;
-}
-
-def EXEC_LO : SIReg<"exec_lo", 126>;
-def EXEC_HI : SIReg<"exec_hi", 127>;
-
-def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> {
-  let Namespace = "AMDGPU";
-  let SubRegIndices = [sub0, sub1];
-  let HWEncoding = 126;
-}
-
-def SCC : SIReg<"scc", 253>;
-def M0 : SIReg <"m0", 124>;
-
-def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes.
-def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes.
-
-// Pair to indicate location of scratch space for flat accesses.
-def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> {
-  let Namespace = "AMDGPU";
-  let SubRegIndices = [sub0, sub1];
-  let HWEncoding = 104;
-}
-
-// SGPR registers
-foreach Index = 0-101 in {
-  def SGPR#Index : SIReg <"SGPR"#Index, Index>;
-}
-
-// VGPR registers
-foreach Index = 0-255 in {
-  def VGPR#Index : SIReg <"VGPR"#Index, Index> {
-    let HWEncoding{8} = 1;
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// Groupings using register classes and tuples
-//===----------------------------------------------------------------------===//
-
-// SGPR 32-bit registers
-def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
-                            (add (sequence "SGPR%u", 0, 101))>;
-
-// SGPR 64-bit registers
-def SGPR_64Regs : RegisterTuples<[sub0, sub1],
-                             [(add (decimate (trunc SGPR_32, 101), 2)),
-                              (add (decimate (shl SGPR_32, 1), 2))]>;
-
-// SGPR 128-bit registers
-def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
-                              [(add (decimate (trunc SGPR_32, 99), 4)),
-                               (add (decimate (shl SGPR_32, 1), 4)),
-                               (add (decimate (shl SGPR_32, 2), 4)),
-                               (add (decimate (shl SGPR_32, 3), 4))]>;
-
-// SGPR 256-bit registers
-def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
-                              [(add (decimate (trunc SGPR_32, 95), 4)),
-                               (add (decimate (shl SGPR_32, 1), 4)),
-                               (add (decimate (shl SGPR_32, 2), 4)),
-                               (add (decimate (shl SGPR_32, 3), 4)),
-                               (add (decimate (shl SGPR_32, 4), 4)),
-                               (add (decimate (shl SGPR_32, 5), 4)),
-                               (add (decimate (shl SGPR_32, 6), 4)),
-                               (add (decimate (shl SGPR_32, 7), 4))]>;
-
-// SGPR 512-bit registers
-def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
-                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
-                              [(add (decimate (trunc SGPR_32, 87), 4)),
-                               (add (decimate (shl SGPR_32, 1), 4)),
-                               (add (decimate (shl SGPR_32, 2), 4)),
-                               (add (decimate (shl SGPR_32, 3), 4)),
-                               (add (decimate (shl SGPR_32, 4), 4)),
-                               (add (decimate (shl SGPR_32, 5), 4)),
-                               (add (decimate (shl SGPR_32, 6), 4)),
-                               (add (decimate (shl SGPR_32, 7), 4)),
-                               (add (decimate (shl SGPR_32, 8), 4)),
-                               (add (decimate (shl SGPR_32, 9), 4)),
-                               (add (decimate (shl SGPR_32, 10), 4)),
-                               (add (decimate (shl SGPR_32, 11), 4)),
-                               (add (decimate (shl SGPR_32, 12), 4)),
-                               (add (decimate (shl SGPR_32, 13), 4)),
-                               (add (decimate (shl SGPR_32, 14), 4)),
-                               (add (decimate (shl SGPR_32, 15), 4))]>;
-
-// VGPR 32-bit registers
-def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
-                            (add (sequence "VGPR%u", 0, 255))>;
-
-// VGPR 64-bit registers
-def VGPR_64 : RegisterTuples<[sub0, sub1],
-                             [(add (trunc VGPR_32, 255)),
-                              (add (shl VGPR_32, 1))]>;
-
-// VGPR 96-bit registers
-def VGPR_96 : RegisterTuples<[sub0, sub1, sub2],
-                             [(add (trunc VGPR_32, 254)),
-                              (add (shl VGPR_32, 1)),
-                              (add (shl VGPR_32, 2))]>;
-
-// VGPR 128-bit registers
-def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
-                              [(add (trunc VGPR_32, 253)),
-                               (add (shl VGPR_32, 1)),
-                               (add (shl VGPR_32, 2)),
-                               (add (shl VGPR_32, 3))]>;
-
-// VGPR 256-bit registers
-def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
-                              [(add (trunc VGPR_32, 249)),
-                               (add (shl VGPR_32, 1)),
-                               (add (shl VGPR_32, 2)),
-                               (add (shl VGPR_32, 3)),
-                               (add (shl VGPR_32, 4)),
-                               (add (shl VGPR_32, 5)),
-                               (add (shl VGPR_32, 6)),
-                               (add (shl VGPR_32, 7))]>;
-
-// VGPR 512-bit registers
-def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
-                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
-                              [(add (trunc VGPR_32, 241)),
-                               (add (shl VGPR_32, 1)),
-                               (add (shl VGPR_32, 2)),
-                               (add (shl VGPR_32, 3)),
-                               (add (shl VGPR_32, 4)),
-                               (add (shl VGPR_32, 5)),
-                               (add (shl VGPR_32, 6)),
-                               (add (shl VGPR_32, 7)),
-                               (add (shl VGPR_32, 8)),
-                               (add (shl VGPR_32, 9)),
-                               (add (shl VGPR_32, 10)),
-                               (add (shl VGPR_32, 11)),
-                               (add (shl VGPR_32, 12)),
-                               (add (shl VGPR_32, 13)),
-                               (add (shl VGPR_32, 14)),
-                               (add (shl VGPR_32, 15))]>;
-
-//===----------------------------------------------------------------------===//
-// Register classes used as source and destination
-//===----------------------------------------------------------------------===//
-
-class RegImmMatcher <string name> : AsmOperandClass {
-  let Name = name;
-  let RenderMethod = "addRegOrImmOperands";
-}
-
-// Special register classes for predicates and the M0 register
-def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> {
-  let CopyCost = -1; // Theoretically it is possible to read from SCC,
-                     // but it should never be necessary.
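-                     // (A negative CopyCost tells the register allocator
-                     // that copying out of this class is effectively
-                     // impossible, so it will never try.)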
-}
-
-def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>;
-def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>;
-
-// Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
-  (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
->;
-
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>;
-
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64,
-  (add SGPR_64, VCCReg, EXECReg, FLAT_SCR)
->;
-
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
-
-def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
-
-def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
-
-// Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
-
-def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
-  let Size = 96;
-}
-
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
-
-def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>;
-
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
-
-def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
-  let Size = 32;
-}
-
-class RegImmOperand <RegisterClass rc> : RegisterOperand<rc> {
-  let OperandNamespace = "AMDGPU";
-  let OperandType = "OPERAND_REG_IMM32";
-}
-
-class RegInlineOperand <RegisterClass rc> : RegisterOperand<rc> {
-  let OperandNamespace = "AMDGPU";
-  let OperandType = "OPERAND_REG_INLINE_C";
-}
-
-//===----------------------------------------------------------------------===//
-// SSrc_* Operands with an SGPR or a 32-bit immediate
-//===----------------------------------------------------------------------===//
-
-def SSrc_32 : RegImmOperand<SReg_32> {
-  let ParserMatchClass = RegImmMatcher<"SSrc32">;
-}
-
-def SSrc_64 : RegImmOperand<SReg_64> {
-  let ParserMatchClass = RegImmMatcher<"SSrc64">;
-}
-
-//===----------------------------------------------------------------------===//
-// SCSrc_* Operands with an SGPR or an inline constant
-//===----------------------------------------------------------------------===//
-
-def SCSrc_32 : RegInlineOperand<SReg_32> {
-  let ParserMatchClass = RegImmMatcher<"SCSrc32">;
-}
-
-//===----------------------------------------------------------------------===//
-// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
-//===----------------------------------------------------------------------===//
-
-def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>;
-
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
-
-def VSrc_32 : RegisterOperand<VS_32> {
-  let OperandNamespace = "AMDGPU";
-  let OperandType = "OPERAND_REG_IMM32";
-  let ParserMatchClass = RegImmMatcher<"VSrc32">;
-}
-
-def VSrc_64 : RegisterOperand<VS_64> {
-  let OperandNamespace = "AMDGPU";
-  let OperandType = "OPERAND_REG_IMM32";
-  let ParserMatchClass = RegImmMatcher<"VSrc64">;
-}
-
-//===----------------------------------------------------------------------===//
-// VCSrc_* Operands with an SGPR, VGPR or an inline constant
-//===----------------------------------------------------------------------===//
-
-def VCSrc_32 : RegisterOperand<VS_32> {
-  let OperandNamespace = "AMDGPU";
-  let OperandType = "OPERAND_REG_INLINE_C";
-  let ParserMatchClass = RegImmMatcher<"VCSrc32">;
-}
-
-def VCSrc_64 : RegisterOperand<VS_64> {
-  let OperandNamespace = "AMDGPU";
-  let OperandType = "OPERAND_REG_INLINE_C";
"OPERAND_REG_INLINE_C"; - let ParserMatchClass = RegImmMatcher<"VCSrc64">; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp (nonexistent) @@ -1,421 +0,0 @@ -//===-- SILoadStoreOptimizer.cpp ------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass tries to fuse DS instructions with close by immediate offsets. -// This will fuse operations such as -// ds_read_b32 v0, v2 offset:16 -// ds_read_b32 v1, v2 offset:32 -// ==> -// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 -// -// -// Future improvements: -// -// - This currently relies on the scheduler to place loads and stores next to -// each other, and then only merges adjacent pairs of instructions. It would -// be good to be more flexible with interleaved instructions, and possibly run -// before scheduling. It currently missing stores of constants because loading -// the constant into the data register is placed between the stores, although -// this is arguably a scheduling problem. -// -// - Live interval recomputing seems inefficient. This currently only matches -// one pair, and recomputes live intervals and moves on to the next pair. It -// would be better to compute a list of all merges that need to occur -// -// - With a list of instructions to process, we can also merge more. If a -// cluster of loads have offsets that are too large to fit in the 8-bit -// offsets, but are close enough to fit in the 8 bits, we can add to the base -// pointer and use the new reduced offsets. 
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/LiveVariables.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-load-store-opt"
-
-namespace {
-
-class SILoadStoreOptimizer : public MachineFunctionPass {
-private:
-  const SIInstrInfo *TII;
-  const SIRegisterInfo *TRI;
-  MachineRegisterInfo *MRI;
-  LiveIntervals *LIS;
-
-
-  static bool offsetsCanBeCombined(unsigned Offset0,
-                                   unsigned Offset1,
-                                   unsigned EltSize);
-
-  MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
-                                                 unsigned EltSize);
-
-  void updateRegDefsUses(unsigned SrcReg,
-                         unsigned DstReg,
-                         unsigned SubIdx);
-
-  MachineBasicBlock::iterator mergeRead2Pair(
-    MachineBasicBlock::iterator I,
-    MachineBasicBlock::iterator Paired,
-    unsigned EltSize);
-
-  MachineBasicBlock::iterator mergeWrite2Pair(
-    MachineBasicBlock::iterator I,
-    MachineBasicBlock::iterator Paired,
-    unsigned EltSize);
-
-public:
-  static char ID;
-
-  SILoadStoreOptimizer()
-      : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
-        LIS(nullptr) {}
-
-  SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
-    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool optimizeBlock(MachineBasicBlock &MBB);
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  const char *getPassName() const override {
-    return "SI Load / Store Optimizer";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    AU.addPreserved<LiveIntervals>();
-    AU.addPreserved<LiveVariables>();
-    AU.addPreserved<SlotIndexes>();
-    AU.addRequired<LiveIntervals>();
-
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
-                      "SI Load / Store Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
-INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
-                    "SI Load / Store Optimizer", false, false)
-
-char SILoadStoreOptimizer::ID = 0;
-
-char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
-
-FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
-  return new SILoadStoreOptimizer(TM);
-}
-
-bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
-                                                unsigned Offset1,
-                                                unsigned Size) {
-  // XXX - Would the same offset be OK? Is there any reason this would happen or
-  // be useful?
-  if (Offset0 == Offset1)
-    return false;
-
-  // This won't be valid if the offset isn't aligned.
-  if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
-    return false;
-
-  unsigned EltOffset0 = Offset0 / Size;
-  unsigned EltOffset1 = Offset1 / Size;
-
-  // Check if the new offsets fit in the reduced 8-bit range.
-  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
-    return true;
-
-  // If the offset in elements doesn't fit in 8-bits, we might be able to use
-  // the stride 64 versions.
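-  // (Illustrative numbers: byte offsets 2048 and 4096 with Size == 4 give
-  // element offsets 512 and 1024. Both exceed 8 bits, but both are multiples
-  // of 64, so the st64 forms encode them as 512/64 = 8 and 1024/64 = 16.)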
- if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0) - return false; - - return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64); -} - -MachineBasicBlock::iterator -SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, - unsigned EltSize){ - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator MBBI = I; - ++MBBI; - - if (MBBI->getOpcode() != I->getOpcode()) - return E; - - // Don't merge volatiles. - if (MBBI->hasOrderedMemoryRef()) - return E; - - int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); - const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); - const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); - - // Check same base pointer. Be careful of subregisters, which can occur with - // vectors of pointers. - if (AddrReg0.getReg() == AddrReg1.getReg() && - AddrReg0.getSubReg() == AddrReg1.getSubReg()) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), - AMDGPU::OpName::offset); - unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; - unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; - - // Check both offsets fit in the reduced range. - if (offsetsCanBeCombined(Offset0, Offset1, EltSize)) - return MBBI; - } - - return E; -} - -void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg, - unsigned DstReg, - unsigned SubIdx) { - for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg), - E = MRI->reg_end(); I != E; ) { - MachineOperand &O = *I; - ++I; - O.substVirtReg(DstReg, SubIdx, *TRI); - } -} - -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize) { - MachineBasicBlock *MBB = I->getParent(); - - // Be careful, since the addresses could be subregisters themselves in weird - // cases, like vectors of pointers. - const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); - - unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); - unsigned DestReg1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg(); - - unsigned Offset0 - = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; - unsigned Offset1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; - - unsigned NewOffset0 = Offset0 / EltSize; - unsigned NewOffset1 = Offset1 / EltSize; - unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; - - // Prefer the st64 form if we can use it, even if we can fit the offset in the - // non st64 version. I'm not sure if there's any real reason to do this. - bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); - if (UseST64) { - NewOffset0 /= 64; - NewOffset1 /= 64; - Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; - } - - assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); - - const MCInstrDesc &Read2Desc = TII->get(Opc); - - const TargetRegisterClass *SuperRC - = (EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
-  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
-
-  DebugLoc DL = I->getDebugLoc();
-  MachineInstrBuilder Read2
-    = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
-    .addOperand(*AddrReg) // addr
-    .addImm(NewOffset0) // offset0
-    .addImm(NewOffset1) // offset1
-    .addImm(0) // gds
-    .addMemOperand(*I->memoperands_begin())
-    .addMemOperand(*Paired->memoperands_begin());
-
-  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
-  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
-  updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
-  updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);
-
-  LIS->RemoveMachineInstrFromMaps(I);
-  // Replacing Paired in the maps with Read2 allows us to avoid updating the
-  // live range for the m0 register.
-  LIS->ReplaceMachineInstrInMaps(Paired, Read2);
-  I->eraseFromParent();
-  Paired->eraseFromParent();
-
-  LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
-  LIS->shrinkToUses(&AddrRegLI);
-
-  LIS->getInterval(DestReg); // Create new LI
-
-  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
-  return Read2.getInstr();
-}
-
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
-  MachineBasicBlock::iterator I,
-  MachineBasicBlock::iterator Paired,
-  unsigned EltSize) {
-  MachineBasicBlock *MBB = I->getParent();
-
-  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
-  // sure we preserve the subregister index and any register flags set on them.
-  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
-  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
-  const MachineOperand *Data1
-    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
-
-
-  unsigned Offset0
-    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
-  unsigned Offset1
-    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
-
-  unsigned NewOffset0 = Offset0 / EltSize;
-  unsigned NewOffset1 = Offset1 / EltSize;
-  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
-
-  // Prefer the st64 form if we can use it, even if we can fit the offset in the
-  // non st64 version. I'm not sure if there's any real reason to do this.
-  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
-  if (UseST64) {
-    NewOffset0 /= 64;
-    NewOffset1 /= 64;
-    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
-  }
-
-  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
-         (NewOffset0 != NewOffset1) &&
-         "Computed offset doesn't fit");
-
-  const MCInstrDesc &Write2Desc = TII->get(Opc);
-  DebugLoc DL = I->getDebugLoc();
-
-  // repairIntervalsInRange() doesn't handle physical registers, so we have
-  // to update the M0 range manually.
-  SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
-  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
-  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
-  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
-  MachineInstrBuilder Write2
-    = BuildMI(*MBB, I, DL, Write2Desc)
-    .addOperand(*Addr) // addr
-    .addOperand(*Data0) // data0
-    .addOperand(*Data1) // data1
-    .addImm(NewOffset0) // offset0
-    .addImm(NewOffset1) // offset1
-    .addImm(0) // gds
-    .addMemOperand(*I->memoperands_begin())
-    .addMemOperand(*Paired->memoperands_begin());
-
-  // XXX - How do we express subregisters here?
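-  // (The data operands can carry subregister indices, e.g. from vectors of
-  // pointers, but OrigRegs below only names their top-level virtual
-  // registers for the interval repair.)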
-  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
-
-  LIS->RemoveMachineInstrFromMaps(I);
-  LIS->RemoveMachineInstrFromMaps(Paired);
-  I->eraseFromParent();
-  Paired->eraseFromParent();
-
-  // This doesn't handle physical registers like M0
-  LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
-
-  if (UpdateM0Range) {
-    SlotIndex Write2Index = LIS->getInstructionIndex(Write2);
-    M0Segment->end = Write2Index.getRegSlot();
-  }
-
-  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
-  return Write2.getInstr();
-}
-
-// Scan through looking for adjacent LDS operations with constant offsets from
-// the same base register. We rely on the scheduler to do the hard work of
-// clustering nearby loads, and assume these are all adjacent.
-bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
-  bool Modified = false;
-
-  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
-    MachineInstr &MI = *I;
-
-    // Don't combine if volatile.
-    if (MI.hasOrderedMemoryRef()) {
-      ++I;
-      continue;
-    }
-
-    unsigned Opc = MI.getOpcode();
-    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
-      unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
-      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
-      if (Match != E) {
-        Modified = true;
-        I = mergeRead2Pair(I, Match, Size);
-      } else {
-        ++I;
-      }
-
-      continue;
-    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
-      unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
-      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
-      if (Match != E) {
-        Modified = true;
-        I = mergeWrite2Pair(I, Match, Size);
-      } else {
-        ++I;
-      }
-
-      continue;
-    }
-
-    ++I;
-  }
-
-  return Modified;
-}
-
-bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
-  const TargetSubtargetInfo &STM = MF.getSubtarget();
-  TRI = static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
-  TII = static_cast<const SIInstrInfo *>(STM.getInstrInfo());
-  MRI = &MF.getRegInfo();
-
-  LIS = &getAnalysis<LiveIntervals>();
-
-  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
-
-  assert(!MRI->isSSA());
-
-  bool Modified = false;
-
-  for (MachineBasicBlock &MBB : MF)
-    Modified |= optimizeBlock(MBB);
-
-  return Modified;
-}

Property changes on: projects/clang370-import/contrib/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp
___________________________________________________________________
Deleted: svn:eol-style
## -1 +0,0 ##
-native
\ No newline at end of property
Deleted: svn:keywords
## -1 +0,0 ##
-FreeBSD=%H
\ No newline at end of property
Deleted: svn:mime-type
## -1 +0,0 ##
-text/plain
\ No newline at end of property
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td	(nonexistent)
@@ -1,90 +0,0 @@
-//===-- AMDGPUIntrinsics.td - Common intrinsics  -*- tablegen -*-----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines intrinsics that are used by all hw codegen targets.
-// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "AMDGPU", isTarget = 1 in { - - def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; - def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - - // This is named backwards (instead of rsq_legacy) so we don't have - // to define it with the public builtins intrinsics. This is a - // workaround for how intrinsic names are parsed. If the name is - // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant - // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. - def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - - def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; - def int_AMDGPU_kilp : Intrinsic<[], [], []>; - def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imax 
: Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; -} - -// Legacy names for compatibility. -let TargetPrefix = "AMDIL", isTarget = 1 in { - def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; -} - -let TargetPrefix = "TGSI", isTarget = 1 in { - - def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; -} - -include "SIIntrinsics.td" Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp (nonexistent) @@ -1,605 +0,0 @@ -//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This pass lowers the pseudo control flow instructions to real -/// machine instructions. -/// -/// All control flow is handled using predicated instructions and -/// a predicate stack. 
Each Scalar ALU controls the operations of 64 Vector -/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs -/// by writing to the 64-bit EXEC register (each bit corresponds to a -/// single vector ALU). Typically, for predicates, a vector ALU will write -/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each -/// Vector ALU) and then the Scalar ALU will AND the VCC register with the -/// EXEC to update the predicates. -/// -/// For example: -/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 -/// %SGPR0 = SI_IF %VCC -/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 -/// %SGPR0 = SI_ELSE %SGPR0 -/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 -/// SI_END_CF %SGPR0 -/// -/// becomes: -/// -/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask -/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask -/// S_CBRANCH_EXECZ label0 // This instruction is an optional -/// // optimization which allows us to -/// // branch if all the bits of -/// // EXEC are zero. -/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch -/// -/// label0: -/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block -/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask -/// S_CBRANCH_EXECZ label1 // Use our branch optimization -/// // instruction again. -/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block -/// label1: -/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Constants.h" - -using namespace llvm; - -namespace { - -class SILowerControlFlowPass : public MachineFunctionPass { - -private: - static const unsigned SkipThreshold = 12; - - static char ID; - const SIRegisterInfo *TRI; - const SIInstrInfo *TII; - - bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); - - void Skip(MachineInstr &From, MachineOperand &To); - void SkipIfDead(MachineInstr &MI); - - void If(MachineInstr &MI); - void Else(MachineInstr &MI); - void Break(MachineInstr &MI); - void IfBreak(MachineInstr &MI); - void ElseBreak(MachineInstr &MI); - void Loop(MachineInstr &MI); - void EndCf(MachineInstr &MI); - - void Kill(MachineInstr &MI); - void Branch(MachineInstr &MI); - - void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); - void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset); - void IndirectSrc(MachineInstr &MI); - void IndirectDst(MachineInstr &MI); - -public: - SILowerControlFlowPass(TargetMachine &tm) : - MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Lower control flow instructions"; - } - -}; - -} // End anonymous namespace - -char SILowerControlFlowPass::ID = 0; - -FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { - return new SILowerControlFlowPass(tm); -} - -bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From, - MachineBasicBlock *To) { - - unsigned NumInstr = 0; - - for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty(); - MBB = 
*MBB->succ_begin()) { - - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - NumInstr < SkipThreshold && I != E; ++I) { - - if (I->isBundle() || !I->isBundled()) - if (++NumInstr >= SkipThreshold) - return true; - } - } - - return false; -} - -void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { - - if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) - return; - - DebugLoc DL = From.getDebugLoc(); - BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addOperand(To) - .addReg(AMDGPU::EXEC); -} - -void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() != - ShaderType::PIXEL || - !shouldSkip(&MBB, &MBB.getParent()->back())) - return; - - MachineBasicBlock::iterator Insert = &MI; - ++Insert; - - // If the exec mask is non-zero, skip the next two instructions - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3) - .addReg(AMDGPU::EXEC); - - // Exec mask is zero: Export to NULL target... - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) - .addImm(0) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0); - - // ... and terminate wavefront - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); -} - -void SILowerControlFlowPass::If(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Reg = MI.getOperand(0).getReg(); - unsigned Vcc = MI.getOperand(1).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) - .addReg(Vcc); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) - .addReg(AMDGPU::EXEC) - .addReg(Reg); - - Skip(MI, MI.getOperand(2)); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Else(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); - - BuildMI(MBB, MBB.getFirstNonPHI(), DL, - TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) - .addReg(Src); // Saved EXEC - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Dst); - - Skip(MI, MI.getOperand(2)); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Break(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(AMDGPU::EXEC) - .addReg(Src); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Vcc = MI.getOperand(1).getReg(); - unsigned Src = MI.getOperand(2).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(Vcc) - .addReg(Src); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Saved = MI.getOperand(1).getReg(); - unsigned Src = MI.getOperand(2).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(Saved) - 
.addReg(Src); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Loop(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Src = MI.getOperand(0).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Src); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addOperand(MI.getOperand(1)) - .addReg(AMDGPU::EXEC); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::EndCf(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Reg = MI.getOperand(0).getReg(); - - BuildMI(MBB, MBB.getFirstNonPHI(), DL, - TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Reg); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Branch(MachineInstr &MI) { - if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode()) - MI.eraseFromParent(); - - // If these aren't equal, this is probably an infinite loop. -} - -void SILowerControlFlowPass::Kill(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - const MachineOperand &Op = MI.getOperand(0); - -#ifndef NDEBUG - const SIMachineFunctionInfo *MFI - = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); - // Kill is only allowed in pixel / geometry shaders. - assert(MFI->getShaderType() == ShaderType::PIXEL || - MFI->getShaderType() == ShaderType::GEOMETRY); -#endif - - // Clear this thread from the exec mask if the operand is negative - if (Op.isImm()) { - // Constant operand: Set exec mask to 0 or do nothing - if (Op.getImm() & 0x80000000) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addImm(0); - } - } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) - .addImm(0) - .addOperand(Op); - } - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - MachineBasicBlock::iterator I = MI; - - unsigned Save = MI.getOperand(1).getReg(); - unsigned Idx = MI.getOperand(3).getReg(); - - if (AMDGPU::SReg_32RegClass.contains(Idx)) { - if (Offset) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(Idx) - .addImm(Offset); - } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(Idx); - } - MBB.insert(I, MovRel); - } else { - - assert(AMDGPU::SReg_64RegClass.contains(Save)); - assert(AMDGPU::VGPR_32RegClass.contains(Idx)); - - // Save the EXEC mask - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) - .addReg(AMDGPU::EXEC); - - // Read the next variant into VCC (lower 32 bits) <- also loop target - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - AMDGPU::VCC_LO) - .addReg(Idx); - - // Move index from VCC into M0 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(AMDGPU::VCC_LO); - - // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) - .addReg(AMDGPU::M0) - .addReg(Idx); - - // Update EXEC, save the original EXEC value to VCC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) - .addReg(AMDGPU::VCC); - - if (Offset) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(AMDGPU::M0) - .addImm(Offset); - } - // Do the actual move - MBB.insert(I, MovRel); - - // Update EXEC, switch all done bits to 0 and all 
todo bits to 1 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); - - // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7) - .addReg(AMDGPU::EXEC); - - // Restore EXEC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(Save); - - } - MI.eraseFromParent(); -} - -/// \param @VecReg The register which holds element zero of the vector -/// being addressed into. -/// \param[out] @Reg The base register to use in the indirect addressing instruction. -/// \param[in,out] @Offset As an input, this is the constant offset part of the -// indirect Index. e.g. v0 = v[VecReg + Offset] -// As an output, this is a constant value that needs -// to be added to the value stored in M0. -void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, - unsigned &Reg, - int &Offset) { - unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); - if (!SubReg) - SubReg = VecReg; - - const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); - int RegIdx = TRI->getHWRegIndex(SubReg) + Offset; - - if (RegIdx < 0) { - Offset = RegIdx; - RegIdx = 0; - } else { - Offset = 0; - } - - Reg = RC->getRegister(RegIdx); -} - -void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Vec = MI.getOperand(2).getReg(); - int Off = MI.getOperand(4).getImm(); - unsigned Reg; - - computeIndirectRegAndOffset(Vec, Reg, Off); - - MachineInstr *MovRel = - BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(Reg) - .addReg(AMDGPU::M0, RegState::Implicit) - .addReg(Vec, RegState::Implicit); - - LoadM0(MI, MovRel, Off); -} - -void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - int Off = MI.getOperand(4).getImm(); - unsigned Val = MI.getOperand(5).getReg(); - unsigned Reg; - - computeIndirectRegAndOffset(Dst, Reg, Off); - - MachineInstr *MovRel = - BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) - .addReg(Reg, RegState::Define) - .addReg(Val) - .addReg(AMDGPU::M0, RegState::Implicit) - .addReg(Dst, RegState::Implicit); - - LoadM0(MI, MovRel, Off); -} - -bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); - TRI = - static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - - bool HaveKill = false; - bool NeedWQM = false; - bool NeedFlat = false; - unsigned Depth = 0; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - - MachineInstr &MI = *I; - if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) - NeedWQM = true; - - // Flat uses m0 in case it needs to access LDS. 
- if (TII->isFLAT(MI.getOpcode())) - NeedFlat = true; - - switch (MI.getOpcode()) { - default: break; - case AMDGPU::SI_IF: - ++Depth; - If(MI); - break; - - case AMDGPU::SI_ELSE: - Else(MI); - break; - - case AMDGPU::SI_BREAK: - Break(MI); - break; - - case AMDGPU::SI_IF_BREAK: - IfBreak(MI); - break; - - case AMDGPU::SI_ELSE_BREAK: - ElseBreak(MI); - break; - - case AMDGPU::SI_LOOP: - ++Depth; - Loop(MI); - break; - - case AMDGPU::SI_END_CF: - if (--Depth == 0 && HaveKill) { - SkipIfDead(MI); - HaveKill = false; - } - EndCf(MI); - break; - - case AMDGPU::SI_KILL: - if (Depth == 0) - SkipIfDead(MI); - else - HaveKill = true; - Kill(MI); - break; - - case AMDGPU::S_BRANCH: - Branch(MI); - break; - - case AMDGPU::SI_INDIRECT_SRC: - IndirectSrc(MI); - break; - - case AMDGPU::SI_INDIRECT_DST_V1: - case AMDGPU::SI_INDIRECT_DST_V2: - case AMDGPU::SI_INDIRECT_DST_V4: - case AMDGPU::SI_INDIRECT_DST_V8: - case AMDGPU::SI_INDIRECT_DST_V16: - IndirectDst(MI); - break; - } - } - } - - if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { - MachineBasicBlock &MBB = MF.front(); - BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC).addReg(AMDGPU::EXEC); - } - - // FIXME: This seems inappropriate to do here. - if (NeedFlat && MFI->IsKernel) { - // Insert the prologue initializing the SGPRs pointing to the scratch space - // for flat accesses. - const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - - // TODO: What to use with function calls? - - // FIXME: This is reporting stack size that is used in a scratch buffer - // rather than registers as well. - uint64_t StackSizeBytes = FrameInfo->getStackSize(); - - int IndirectBegin - = static_cast<const AMDGPUInstrInfo *>(TII)->getIndirectIndexBegin(MF); - // Convert register index to 256-byte unit. - uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); - - assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && - "Stack limits should be smaller than 16-bits"); - - // Initialize the flat scratch register pair. - // TODO: Can we use one s_mov_b64 here? - - // Offset is in units of 256-bytes. - MachineBasicBlock &MBB = MF.front(); - DebugLoc NoDL; - MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); - const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); - - assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); - - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) - .addImm(StackOffset); - - // Documentation says size is "per-thread scratch size in bytes" - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) - .addImm(StackSizeBytes); - } - - return true; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPU.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPU.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPU.h (nonexistent) @@ -1,148 +0,0 @@ -//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H -#define LLVM_LIB_TARGET_R600_AMDGPU_H - -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetMachine.h" - -namespace llvm { - -class AMDGPUInstrPrinter; -class AMDGPUSubtarget; -class AMDGPUTargetMachine; -class FunctionPass; -class MCAsmInfo; -class raw_ostream; -class Target; -class TargetMachine; - -// R600 Passes -FunctionPass *createR600VectorRegMerger(TargetMachine &tm); -FunctionPass *createR600TextureIntrinsicsReplacer(); -FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); -FunctionPass *createR600EmitClauseMarkers(); -FunctionPass *createR600ClauseMergePass(TargetMachine &tm); -FunctionPass *createR600Packetizer(TargetMachine &tm); -FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); -FunctionPass *createAMDGPUCFGStructurizerPass(); - -// SI Passes -FunctionPass *createSITypeRewriter(); -FunctionPass *createSIAnnotateControlFlowPass(); -FunctionPass *createSIFoldOperandsPass(); -FunctionPass *createSILowerI1CopiesPass(); -FunctionPass *createSIShrinkInstructionsPass(); -FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); -FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); -FunctionPass *createSIFixControlFlowLiveIntervalsPass(); -FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); -FunctionPass *createSIFixSGPRLiveRangesPass(); -FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); -FunctionPass *createSIInsertWaits(TargetMachine &tm); -FunctionPass *createSIPrepareScratchRegs(); - -void initializeSIFoldOperandsPass(PassRegistry &); -extern char &SIFoldOperandsID; - -void initializeSILowerI1CopiesPass(PassRegistry &); -extern char &SILowerI1CopiesID; - -void initializeSILoadStoreOptimizerPass(PassRegistry &); -extern char &SILoadStoreOptimizerID; - -// Passes common to R600 and SI -FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); -Pass *createAMDGPUStructurizeCFGPass(); -FunctionPass *createAMDGPUISelDag(TargetMachine &tm); -ModulePass *createAMDGPUAlwaysInlinePass(); - -void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); -extern char &SIFixControlFlowLiveIntervalsID; - -void initializeSIFixSGPRLiveRangesPass(PassRegistry&); -extern char &SIFixSGPRLiveRangesID; - - -extern Target TheAMDGPUTarget; -extern Target TheGCNTarget; - -namespace AMDGPU { -enum TargetIndex { - TI_CONSTDATA_START, - TI_SCRATCH_RSRC_DWORD0, - TI_SCRATCH_RSRC_DWORD1, - TI_SCRATCH_RSRC_DWORD2, - TI_SCRATCH_RSRC_DWORD3 -}; -} - -#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" - -} // End namespace llvm - -namespace ShaderType { - enum Type { - PIXEL = 0, - VERTEX = 1, - GEOMETRY = 2, - COMPUTE = 3 - }; -} - -/// OpenCL uses address spaces to differentiate between -/// various memory regions on the hardware. On the CPU -/// all of the address spaces point to the same memory, -/// however on the GPU, each address space points to -/// a separate piece of memory that is unique from other -/// memory locations. -namespace AMDGPUAS { -enum AddressSpaces : unsigned { - PRIVATE_ADDRESS = 0, ///< Address space for private memory. - GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - CONSTANT_ADDRESS = 2, ///< Address space for constant memory - LOCAL_ADDRESS = 3, ///< Address space for local memory. - FLAT_ADDRESS = 4, ///< Address space for flat memory. - REGION_ADDRESS = 5, ///< Address space for region memory. 
- PARAM_D_ADDRESS = 6, ///< Address space for direct addressable parameter memory (CONST0) - PARAM_I_ADDRESS = 7, ///< Address space for indirect addressable parameter memory (VTX1) - - // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this - // order to be able to dynamically index a constant buffer, for example: - // - // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx - - CONSTANT_BUFFER_0 = 8, - CONSTANT_BUFFER_1 = 9, - CONSTANT_BUFFER_2 = 10, - CONSTANT_BUFFER_3 = 11, - CONSTANT_BUFFER_4 = 12, - CONSTANT_BUFFER_5 = 13, - CONSTANT_BUFFER_6 = 14, - CONSTANT_BUFFER_7 = 15, - CONSTANT_BUFFER_8 = 16, - CONSTANT_BUFFER_9 = 17, - CONSTANT_BUFFER_10 = 18, - CONSTANT_BUFFER_11 = 19, - CONSTANT_BUFFER_12 = 20, - CONSTANT_BUFFER_13 = 21, - CONSTANT_BUFFER_14 = 22, - CONSTANT_BUFFER_15 = 23, - ADDRESS_NONE = 24, ///< Address space for unknown memory. - LAST_ADDRESS = ADDRESS_NONE, - - // Some places use this if the address space can't be determined. - UNKNOWN_ADDRESS_SPACE = ~0u -}; - -} // namespace AMDGPUAS - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp (nonexistent) @@ -1,2723 +0,0 @@ -//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief SI Implementation of TargetInstrInfo. -// -//===----------------------------------------------------------------------===// - - -#include "SIInstrInfo.h" -#include "AMDGPUTargetMachine.h" -#include "SIDefines.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/MC/MCInstrDesc.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), RI() {} - -//===----------------------------------------------------------------------===// -// TargetInstrInfo callbacks -//===----------------------------------------------------------------------===// - -static unsigned getNumOperandsNoGlue(SDNode *Node) { - unsigned N = Node->getNumOperands(); - while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) - --N; - return N; -} - -static SDValue findChainOperand(SDNode *Load) { - SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); - assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); - return LastOp; -} - -/// \brief Returns true if both nodes have the same value for the given -/// operand \p Op, or if both nodes do not have this operand. 
-static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { - unsigned Opc0 = N0->getMachineOpcode(); - unsigned Opc1 = N1->getMachineOpcode(); - - int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); - int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); - - if (Op0Idx == -1 && Op1Idx == -1) - return true; - - - if ((Op0Idx == -1 && Op1Idx != -1) || - (Op1Idx == -1 && Op0Idx != -1)) - return false; - - // getNamedOperandIdx returns the index for the MachineInstr's operands, - // which includes the result as the first operand. We are indexing into the - // MachineSDNode's operands, so we need to skip the result operand to get - // the real index. - --Op0Idx; - --Op1Idx; - - return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); -} - -bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA) const { - // TODO: The generic check fails for VALU instructions that should be - // rematerializable due to implicit reads of exec. We really want all of the - // generic logic here except for this check. - switch (MI->getOpcode()) { - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - return true; - default: - return false; - } -} - -bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, - int64_t &Offset0, - int64_t &Offset1) const { - if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) - return false; - - unsigned Opc0 = Load0->getMachineOpcode(); - unsigned Opc1 = Load1->getMachineOpcode(); - - // Make sure both are actually loads. - if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) - return false; - - if (isDS(Opc0) && isDS(Opc1)) { - - // FIXME: Handle this case: - if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) - return false; - - // Check base reg. - if (Load0->getOperand(1) != Load1->getOperand(1)) - return false; - - // Check chain. - if (findChainOperand(Load0) != findChainOperand(Load1)) - return false; - - // Skip read2 / write2 variants for simplicity. - // TODO: We should report true if the used offsets are adjacent (excluding - // st64 versions). - if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || - AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) - return false; - - Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue(); - Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue(); - return true; - } - - if (isSMRD(Opc0) && isSMRD(Opc1)) { - assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); - - // Check base reg. - if (Load0->getOperand(0) != Load1->getOperand(0)) - return false; - - const ConstantSDNode *Load0Offset = - dyn_cast<ConstantSDNode>(Load0->getOperand(1)); - const ConstantSDNode *Load1Offset = - dyn_cast<ConstantSDNode>(Load1->getOperand(1)); - - if (!Load0Offset || !Load1Offset) - return false; - - // Check chain. - if (findChainOperand(Load0) != findChainOperand(Load1)) - return false; - - Offset0 = Load0Offset->getZExtValue(); - Offset1 = Load1Offset->getZExtValue(); - return true; - } - - // MUBUF and MTBUF can access the same addresses. - if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { - - // MUBUF and MTBUF have vaddr at different indices. 
- if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || - findChainOperand(Load0) != findChainOperand(Load1) || - !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || - !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) - return false; - - int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); - int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); - - if (OffIdx0 == -1 || OffIdx1 == -1) - return false; - - // getNamedOperandIdx returns the index for MachineInstrs. Since they - // include the output in the operand list, but SDNodes don't, we need to - // subtract the index by one. - --OffIdx0; - --OffIdx1; - - SDValue Off0 = Load0->getOperand(OffIdx0); - SDValue Off1 = Load1->getOperand(OffIdx1); - - // The offset might be a FrameIndexSDNode. - if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) - return false; - - Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); - Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); - return true; - } - - return false; -} - -static bool isStride64(unsigned Opc) { - switch (Opc) { - case AMDGPU::DS_READ2ST64_B32: - case AMDGPU::DS_READ2ST64_B64: - case AMDGPU::DS_WRITE2ST64_B32: - case AMDGPU::DS_WRITE2ST64_B64: - return true; - default: - return false; - } -} - -bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, - unsigned &BaseReg, unsigned &Offset, - const TargetRegisterInfo *TRI) const { - unsigned Opc = LdSt->getOpcode(); - if (isDS(Opc)) { - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); - if (OffsetImm) { - // Normal, single offset LDS instruction. - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::addr); - - BaseReg = AddrReg->getReg(); - Offset = OffsetImm->getImm(); - return true; - } - - // The 2 offset instructions use offset0 and offset1 instead. We can treat - // these as a load with a single offset if the 2 offsets are consecutive. We - // will use this for some partially aligned loads. - const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset0); - const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset1); - - uint8_t Offset0 = Offset0Imm->getImm(); - uint8_t Offset1 = Offset1Imm->getImm(); - assert(Offset1 > Offset0); - - if (Offset1 - Offset0 == 1) { - // Each of these offsets is in element sized units, so we need to convert - // to bytes of the individual reads. 
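-      // An illustrative example (sketch, with assumed offsets, not part of
-      // the original comment): a DS_READ2_B32 with offset0 = 2 and
-      // offset1 = 3 defines a 64-bit result class, so EltSize = 8 / 2 = 4
-      // bytes and the pair is reported as one load at
-      // Offset = 4 * 2 = 8 bytes from the base register; the ST64 variants
-      // below additionally scale EltSize by 64.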
- - unsigned EltSize; - if (LdSt->mayLoad()) - EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2; - else { - assert(LdSt->mayStore()); - int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); - EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize(); - } - - if (isStride64(Opc)) - EltSize *= 64; - - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::addr); - BaseReg = AddrReg->getReg(); - Offset = EltSize * Offset0; - return true; - } - - return false; - } - - if (isMUBUF(Opc) || isMTBUF(Opc)) { - if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) - return false; - - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::vaddr); - if (!AddrReg) - return false; - - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); - BaseReg = AddrReg->getReg(); - Offset = OffsetImm->getImm(); - return true; - } - - if (isSMRD(Opc)) { - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); - if (!OffsetImm) - return false; - - const MachineOperand *SBaseReg = getNamedOperand(*LdSt, - AMDGPU::OpName::sbase); - BaseReg = SBaseReg->getReg(); - Offset = OffsetImm->getImm(); - return true; - } - - return false; -} - -bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { - unsigned Opc0 = FirstLdSt->getOpcode(); - unsigned Opc1 = SecondLdSt->getOpcode(); - - // TODO: This needs finer tuning - if (NumLoads > 4) - return false; - - if (isDS(Opc0) && isDS(Opc1)) - return true; - - if (isSMRD(Opc0) && isSMRD(Opc1)) - return true; - - if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) - return true; - - return false; -} - -void -SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - - // If we are trying to copy to or from SCC, there is a bug somewhere else in - // the backend. While it may be theoretically possible to do this, it should - // never be necessary. - assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); - - static const int16_t Sub0_15[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, - AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 - }; - - static const int16_t Sub0_7[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 - }; - - static const int16_t Sub0_3[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 - }; - - static const int16_t Sub0_2[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 - }; - - static const int16_t Sub0_1[] = { - AMDGPU::sub0, AMDGPU::sub1, 0 - }; - - unsigned Opcode; - const int16_t *SubIndices; - - if (AMDGPU::SReg_32RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - - } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { - if (DestReg == AMDGPU::VCC) { - if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) - .addReg(SrcReg, getKillRegState(KillSrc)); - } else { - // FIXME: Hack until VReg_1 removed. 
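-      // The compare below turns a per-lane boolean held in a VGPR into a VCC
-      // bitmask; in assembly the idea is roughly (illustrative, v0 standing
-      // for whatever SrcReg happens to be):
-      //   v_cmp_ne_i32_e32 vcc, 0, v0   ; vcc bit = (v0 != 0) for each lane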
- assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) - .addImm(0) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - - return; - } - - assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - - } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_3; - - } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_7; - - } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_15; - - } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { - assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || - AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - - } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || - AMDGPU::SReg_64RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_1; - - } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_2; - - } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || - AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_3; - - } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || - AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_7; - - } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || - AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_15; - - } else { - llvm_unreachable("Can't copy register!"); - } - - while (unsigned SubIdx = *SubIndices++) { - MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, - get(Opcode), RI.getSubReg(DestReg, SubIdx)); - - Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); - - if (*SubIndices) - Builder.addReg(DestReg, RegState::Define | RegState::Implicit); - } -} - -unsigned SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { - const unsigned Opcode = MI.getOpcode(); - - int NewOpc; - - // Try to map original to commuted opcode - NewOpc = AMDGPU::getCommuteRev(Opcode); - // Check if the commuted (REV) opcode exists on the target. - if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) - return NewOpc; - - // Try to map commuted to original opcode - NewOpc = AMDGPU::getCommuteOrig(Opcode); - // Check if the original (non-REV) opcode exists on the target. - if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) - return NewOpc; - - return Opcode; -} - -unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { - - if (DstRC->getSize() == 4) { - return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; - } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { - return AMDGPU::S_MOV_B64; - } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { - return AMDGPU::V_MOV_B64_PSEUDO; - } - return AMDGPU::COPY; -} - -void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, - int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - MachineFunction *MF = MBB.getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; - - if (RI.isSGPRClass(RC)) { - // We are only allowed to create one new instruction when spilling - // registers, so we need to use a pseudo instruction for spilling - // SGPRs. - switch (RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - MFI->setHasSpilledVGPRs(); - - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; - } - } - - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. 
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" - " spill register"); - BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) - .addReg(SrcReg); - } -} - -void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - MachineFunction *MF = MBB.getParent(); - const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; - - if (RI.isSGPRClass(RC)){ - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; - } - } - - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); - BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - - } else { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" - " restore register"); - BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); - } -} - -/// \param @Offset Offset in bytes of the FrameIndex being spilled -unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - RegScavenger *RS, unsigned TmpReg, - unsigned FrameOffset, - unsigned Size) const { - MachineFunction *MF = MBB.getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); - DebugLoc DL = MBB.findDebugLoc(MI); - unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); - unsigned WavefrontSize = ST.getWavefrontSize(); - - unsigned TIDReg = MFI->getTIDReg(); - if (!MFI->hasCalculatedTID()) { - MachineBasicBlock &Entry = MBB.getParent()->front(); - MachineBasicBlock::iterator Insert = Entry.front(); - DebugLoc DL = Insert->getDebugLoc(); - - TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); - if (TIDReg == AMDGPU::NoRegister) - return TIDReg; - - - if (MFI->getShaderType() == ShaderType::COMPUTE && - WorkGroupSize > WavefrontSize) { - - unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); - unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); - unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); - unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); - for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { - if 
(!Entry.isLiveIn(Reg)) - Entry.addLiveIn(Reg); - } - - RS->enterBasicBlock(&Entry); - unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) - .addReg(InputPtrReg) - .addImm(SI::KernelInputOffsets::NGROUPS_Z); - BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) - .addReg(InputPtrReg) - .addImm(SI::KernelInputOffsets::NGROUPS_Y); - - // NGROUPS.X * NGROUPS.Y - BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) - .addReg(STmp1) - .addReg(STmp0); - // (NGROUPS.X * NGROUPS.Y) * TIDIG.X - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) - .addReg(STmp1) - .addReg(TIDIGXReg); - // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X) - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) - .addReg(STmp0) - .addReg(TIDIGYReg) - .addReg(TIDReg); - // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z - BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) - .addReg(TIDReg) - .addReg(TIDIGZReg); - } else { - // Get the wave id - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), - TIDReg) - .addImm(-1) - .addImm(0); - - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), - TIDReg) - .addImm(-1) - .addReg(TIDReg); - } - - BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), - TIDReg) - .addImm(2) - .addReg(TIDReg); - MFI->setTIDReg(TIDReg); - } - - // Add FrameIndex to LDS offset - unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); - BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) - .addImm(LDSOffset) - .addReg(TIDReg); - - return TmpReg; -} - -void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, - int Count) const { - while (Count > 0) { - int Arg; - if (Count >= 8) - Arg = 7; - else - Arg = Count - 1; - Count -= 8; - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) - .addImm(Arg); - } -} - -bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MBB.findDebugLoc(MI); - switch (MI->getOpcode()) { - default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - - case AMDGPU::SI_CONSTDATA_PTR: { - unsigned Reg = MI->getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); - - BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); - - // Add 32-bit offset from this instruction to the start of the constant data. - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addTargetIndex(AMDGPU::TI_CONSTDATA_START) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) - .addReg(AMDGPU::SCC, RegState::Implicit); - MI->eraseFromParent(); - break; - } - case AMDGPU::SGPR_USE: - // This is just a placeholder for register allocation. - MI->eraseFromParent(); - break; - - case AMDGPU::V_MOV_B64_PSEUDO: { - unsigned Dst = MI->getOperand(0).getReg(); - unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); - unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - - const MachineOperand &SrcOp = MI->getOperand(1); - // FIXME: Will this work for 64-bit floating point immediates? 
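-    // For example (an illustrative sketch, values assumed): materializing
-    // the f64 bit pattern of 1.0 through
-    //   V_MOV_B64_PSEUDO v[0:1], 0x3ff0000000000000
-    // is expanded by the code below into two 32-bit moves:
-    //   V_MOV_B32_e32 v0, 0x00000000   ; low dword
-    //   V_MOV_B32_e32 v1, 0x3ff00000   ; high dword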
- assert(!SrcOp.isFPImm()); - if (SrcOp.isImm()) { - APInt Imm(64, SrcOp.getImm()); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addImm(Imm.getLoBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addImm(Imm.getHiBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit); - } else { - assert(SrcOp.isReg()); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) - .addReg(Dst, RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) - .addReg(Dst, RegState::Implicit); - } - MI->eraseFromParent(); - break; - } - - case AMDGPU::V_CNDMASK_B64_PSEUDO: { - unsigned Dst = MI->getOperand(0).getReg(); - unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); - unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - unsigned Src0 = MI->getOperand(1).getReg(); - unsigned Src1 = MI->getOperand(2).getReg(); - const MachineOperand &SrcCond = MI->getOperand(3); - - BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) - .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) - .addOperand(SrcCond); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) - .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) - .addOperand(SrcCond); - MI->eraseFromParent(); - break; - } - } - return true; -} - -MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { - - if (MI->getNumOperands() < 3) - return nullptr; - - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - assert(Src0Idx != -1 && "Should always have src0 operand"); - - MachineOperand &Src0 = MI->getOperand(Src0Idx); - if (!Src0.isReg()) - return nullptr; - - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); - if (Src1Idx == -1) - return nullptr; - - MachineOperand &Src1 = MI->getOperand(Src1Idx); - - // Make sure it's legal to commute operands for VOP2. - if (isVOP2(MI->getOpcode()) && - (!isOperandLegal(MI, Src0Idx, &Src1) || - !isOperandLegal(MI, Src1Idx, &Src0))) { - return nullptr; - } - - if (!Src1.isReg()) { - // Allow commuting instructions with Imm operands. - if (NewMI || !Src1.isImm() || - (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { - return nullptr; - } - - // Be sure to copy the source modifiers to the right place. - if (MachineOperand *Src0Mods - = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { - MachineOperand *Src1Mods - = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); - - int Src0ModsVal = Src0Mods->getImm(); - if (!Src1Mods && Src0ModsVal != 0) - return nullptr; - - // XXX - This assert might be a lie. It might be useful to have a neg - // modifier with 0.0. 
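-      // Source modifiers are tied to operand slots rather than values, so
-      // when src0 and src1 trade places below, the src0_modifiers and
-      // src1_modifiers immediates are swapped along with them; otherwise a
-      // negate or abs requested on src0 would silently end up applied to the
-      // wrong operand.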
- int Src1ModsVal = Src1Mods->getImm(); - assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); - - Src1Mods->setImm(Src0ModsVal); - Src0Mods->setImm(Src1ModsVal); - } - - unsigned Reg = Src0.getReg(); - unsigned SubReg = Src0.getSubReg(); - if (Src1.isImm()) - Src0.ChangeToImmediate(Src1.getImm()); - else - llvm_unreachable("Should only have immediates"); - - Src1.ChangeToRegister(Reg, false); - Src1.setSubReg(SubReg); - } else { - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); - } - - if (MI) - MI->setDesc(get(commuteOpcode(*MI))); - - return MI; -} - -// This needs to be implemented because the source modifiers may be inserted -// between the true commutable operands, and the base -// TargetInstrInfo::commuteInstruction uses it. -bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { - const MCInstrDesc &MCID = MI->getDesc(); - if (!MCID.isCommutable()) - return false; - - unsigned Opc = MI->getOpcode(); - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - if (Src0Idx == -1) - return false; - - // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on - // immediate. - if (!MI->getOperand(Src0Idx).isReg()) - return false; - - int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); - if (Src1Idx == -1) - return false; - - if (!MI->getOperand(Src1Idx).isReg()) - return false; - - // If any source modifiers are set, the generic instruction commuting won't - // understand how to copy the source modifiers. - if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) - return false; - - SrcOpIdx1 = Src0Idx; - SrcOpIdx2 = Src1Idx; - return true; -} - -MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, - unsigned SrcReg) const { - return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32), - DstReg) .addReg(SrcReg); -} - -bool SIInstrInfo::isMov(unsigned Opcode) const { - switch(Opcode) { - default: return false; - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - return true; - } -} - -bool -SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - return RC != &AMDGPU::EXECRegRegClass; -} - -static void removeModOperands(MachineInstr &MI) { - unsigned Opc = MI.getOpcode(); - int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src0_modifiers); - int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src1_modifiers); - int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src2_modifiers); - - MI.RemoveOperand(Src2ModIdx); - MI.RemoveOperand(Src1ModIdx); - MI.RemoveOperand(Src0ModIdx); -} - -bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, - unsigned Reg, MachineRegisterInfo *MRI) const { - if (!MRI->hasOneNonDBGUse(Reg)) - return false; - - unsigned Opc = UseMI->getOpcode(); - if (Opc == AMDGPU::V_MAD_F32) { - // Don't fold if we are using source modifiers. The new VOP2 instructions - // don't have them. 
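-  // A sketch of the two folds performed below (illustrative encodings, with
-  // 0x41000000 standing for an assumed constant K = 8.0f):
-  //   v_mov_b32 v2, 0x41000000
-  //   v_mad_f32 v0, v2, v1, v3  ->  v_madmk_f32 v0, v1, 0x41000000, v3
-  //   v_mad_f32 v0, v1, v3, v2  ->  v_madak_f32 v0, v1, v3, 0x41000000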
- if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || - hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { - return false; - } - - MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); - MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); - MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); - - // Multiplied part is the constant: Use v_madmk_f32 - // We should only expect these to be on src0 due to canonicalizations. - if (Src0->isReg() && Src0->getReg() == Reg) { - if (!Src1->isReg() || - (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) - return false; - - if (!Src2->isReg() || - (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))) - return false; - - // We need to do some weird looking operand shuffling since the madmk - // operands are out of the normal expected order with the multiplied - // constant as the last operand. - // - // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1 - // src0 -> src2 K - // src1 -> src0 - // src2 -> src1 - - const int64_t Imm = DefMI->getOperand(1).getImm(); - - // FIXME: This would be a lot easier if we could return a new instruction - // instead of having to modify in place. - - // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::clamp)); - - unsigned Src1Reg = Src1->getReg(); - unsigned Src1SubReg = Src1->getSubReg(); - unsigned Src2Reg = Src2->getReg(); - unsigned Src2SubReg = Src2->getSubReg(); - Src0->setReg(Src1Reg); - Src0->setSubReg(Src1SubReg); - Src0->setIsKill(Src1->isKill()); - - Src1->setReg(Src2Reg); - Src1->setSubReg(Src2SubReg); - Src1->setIsKill(Src2->isKill()); - - Src2->ChangeToImmediate(Imm); - - removeModOperands(*UseMI); - UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); - - bool DeleteDef = MRI->hasOneNonDBGUse(Reg); - if (DeleteDef) - DefMI->eraseFromParent(); - - return true; - } - - // Added part is the constant: Use v_madak_f32 - if (Src2->isReg() && Src2->getReg() == Reg) { - // Not allowed to use constant bus for another operand. - // We can however allow an inline immediate as src0. - if (!Src0->isImm() && - (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) - return false; - - if (!Src1->isReg() || - (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) - return false; - - const int64_t Imm = DefMI->getOperand(1).getImm(); - - // FIXME: This would be a lot easier if we could return a new instruction - // instead of having to modify in place. - - // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::clamp)); - - Src2->ChangeToImmediate(Imm); - - // These come before src2. 
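-      // Operands are removed back to front (omod and clamp first, then the
-      // modifier operands) so that the indices of the operands still to be
-      // removed remain valid while the instruction is shrunk to VOP2.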
- removeModOperands(*UseMI); - UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); - - bool DeleteDef = MRI->hasOneNonDBGUse(Reg); - if (DeleteDef) - DefMI->eraseFromParent(); - - return true; - } - } - - return false; -} - -bool -SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA) const { - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA); - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - return MI->getOperand(1).isImm(); - } -} - -static bool offsetsDoNotOverlap(int WidthA, int OffsetA, - int WidthB, int OffsetB) { - int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; - int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; - int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; - return LowOffset + LowWidth <= HighOffset; -} - -bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, - MachineInstr *MIb) const { - unsigned BaseReg0, Offset0; - unsigned BaseReg1, Offset1; - - if (getLdStBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && - getLdStBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { - assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() && - "read2 / write2 not expected here yet"); - unsigned Width0 = (*MIa->memoperands_begin())->getSize(); - unsigned Width1 = (*MIb->memoperands_begin())->getSize(); - if (BaseReg0 == BaseReg1 && - offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { - return true; - } - } - - return false; -} - -bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, - MachineInstr *MIb, - AliasAnalysis *AA) const { - unsigned Opc0 = MIa->getOpcode(); - unsigned Opc1 = MIb->getOpcode(); - - assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && - "MIa must load from or modify a memory location"); - assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && - "MIb must load from or modify a memory location"); - - if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) - return false; - - // XXX - Can we relax this between address spaces? - if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) - return false; - - // TODO: Should we check the address space from the MachineMemOperand? That - // would allow us to distinguish objects we know don't alias based on the - // underlying address space, even if it was lowered to a different one, - // e.g. private accesses lowered to use MUBUF instructions on a scratch - // buffer.
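- // Worked example (illustrative): two DS accesses off the same base register, - // each 4 bytes wide, at offsets 0 and 4, satisfy offsetsDoNotOverlap above - // (0 + 4 <= 4), so they are reported trivially disjoint.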
- if (isDS(Opc0)) { - if (isDS(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return !isFLAT(Opc1); - } - - if (isMUBUF(Opc0) || isMTBUF(Opc0)) { - if (isMUBUF(Opc1) || isMTBUF(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return !isFLAT(Opc1) && !isSMRD(Opc1); - } - - if (isSMRD(Opc0)) { - if (isSMRD(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return !isFLAT(Opc1) && !isMUBUF(Opc1) && !isMTBUF(Opc1); - } - - if (isFLAT(Opc0)) { - if (isFLAT(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return false; - } - - return false; -} - -bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { - int64_t SVal = Imm.getSExtValue(); - if (SVal >= -16 && SVal <= 64) - return true; - - if (Imm.getBitWidth() == 64) { - uint64_t Val = Imm.getZExtValue(); - return (DoubleToBits(0.0) == Val) || - (DoubleToBits(1.0) == Val) || - (DoubleToBits(-1.0) == Val) || - (DoubleToBits(0.5) == Val) || - (DoubleToBits(-0.5) == Val) || - (DoubleToBits(2.0) == Val) || - (DoubleToBits(-2.0) == Val) || - (DoubleToBits(4.0) == Val) || - (DoubleToBits(-4.0) == Val); - } - - // The actual type of the operand does not seem to matter as long - // as the bits match one of the inline immediate values. For example: - // - // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, - // so it is a legal inline immediate. - // - // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in - // floating-point, so it is a legal inline immediate. - uint32_t Val = Imm.getZExtValue(); - - return (FloatToBits(0.0f) == Val) || - (FloatToBits(1.0f) == Val) || - (FloatToBits(-1.0f) == Val) || - (FloatToBits(0.5f) == Val) || - (FloatToBits(-0.5f) == Val) || - (FloatToBits(2.0f) == Val) || - (FloatToBits(-2.0f) == Val) || - (FloatToBits(4.0f) == Val) || - (FloatToBits(-4.0f) == Val); -} - -bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, - unsigned OpSize) const { - if (MO.isImm()) { - // MachineOperand provides no way to tell the true operand size, since it - // only records a 64-bit value. We need to know the size to determine if a - // 32-bit floating point immediate bit pattern is legal for an integer - // immediate. It would be for any 32-bit integer operand, but would not be - // for a 64-bit one.
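- // For example (illustrative): with OpSize == 4, an immediate with the bit - // pattern 0x3f800000 (1.0f) is inline-legal, while the same bits in a 64-bit - // operand would have to be encoded as a literal constant.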
- - unsigned BitSize = 8 * OpSize; - return isInlineConstant(APInt(BitSize, MO.getImm(), true)); - } - - return false; -} - -bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, - unsigned OpSize) const { - return MO.isImm() && !isInlineConstant(MO, OpSize); -} - -static bool compareMachineOp(const MachineOperand &Op0, - const MachineOperand &Op1) { - if (Op0.getType() != Op1.getType()) - return false; - - switch (Op0.getType()) { - case MachineOperand::MO_Register: - return Op0.getReg() == Op1.getReg(); - case MachineOperand::MO_Immediate: - return Op0.getImm() == Op1.getImm(); - default: - llvm_unreachable("Didn't expect to be comparing these operand types"); - } -} - -bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, - const MachineOperand &MO) const { - const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; - - assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); - - if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) - return true; - - if (OpInfo.RegClass < 0) - return false; - - unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); - if (isLiteralConstant(MO, OpSize)) - return RI.opCanUseLiteralConstant(OpInfo.OperandType); - - return RI.opCanUseInlineConstant(OpInfo.OperandType); -} - -bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { - int Op32 = AMDGPU::getVOPe32(Opcode); - if (Op32 == -1) - return false; - - return pseudoToMCOpcode(Op32) != -1; -} - -bool SIInstrInfo::hasModifiers(unsigned Opcode) const { - // The src0_modifier operand is present on all instructions - // that have modifiers. - - return AMDGPU::getNamedOperandIdx(Opcode, - AMDGPU::OpName::src0_modifiers) != -1; -} - -bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, - unsigned OpName) const { - const MachineOperand *Mods = getNamedOperand(MI, OpName); - return Mods && Mods->getImm(); -} - -bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, - const MachineOperand &MO, - unsigned OpSize) const { - // Literal constants use the constant bus. - if (isLiteralConstant(MO, OpSize)) - return true; - - if (!MO.isReg() || !MO.isUse()) - return false; - - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) - return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); - - // FLAT_SCR is just an SGPR pair. - if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) - return true; - - // EXEC register uses the constant bus. - if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) - return true; - - // SGPRs use the constant bus - if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || - (!MO.isImplicit() && - (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || - AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { - return true; - } - - return false; -} - -bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, - StringRef &ErrInfo) const { - uint16_t Opcode = MI->getOpcode(); - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); - int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); - - // Make sure the number of operands is correct. 
- const MCInstrDesc &Desc = get(Opcode); - if (!Desc.isVariadic() && - Desc.getNumOperands() != MI->getNumExplicitOperands()) { - ErrInfo = "Instruction has wrong number of operands."; - return false; - } - - // Make sure the register classes are correct - for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { - if (MI->getOperand(i).isFPImm()) { - ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " - "all fp values to integers."; - return false; - } - - int RegClass = Desc.OpInfo[i].RegClass; - - switch (Desc.OpInfo[i].OperandType) { - case MCOI::OPERAND_REGISTER: - if (MI->getOperand(i).isImm()) { - ErrInfo = "Illegal immediate value for operand."; - return false; - } - break; - case AMDGPU::OPERAND_REG_IMM32: - break; - case AMDGPU::OPERAND_REG_INLINE_C: - if (isLiteralConstant(MI->getOperand(i), - RI.getRegClass(RegClass)->getSize())) { - ErrInfo = "Illegal immediate value for operand."; - return false; - } - break; - case MCOI::OPERAND_IMMEDIATE: - // Check if this operand is an immediate. - // FrameIndex operands will be replaced by immediates, so they are - // allowed. - if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { - ErrInfo = "Expected immediate, but got non-immediate"; - return false; - } - // Fall-through - default: - continue; - } - - if (!MI->getOperand(i).isReg()) - continue; - - if (RegClass != -1) { - unsigned Reg = MI->getOperand(i).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - - const TargetRegisterClass *RC = RI.getRegClass(RegClass); - if (!RC->contains(Reg)) { - ErrInfo = "Operand has incorrect register class."; - return false; - } - } - } - - - // Verify VOP* - if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { - // Only look at the true operands. Only a real operand can use the constant - // bus, and we don't want to check pseudo-operands like the source modifier - // flags. - const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; - - unsigned ConstantBusCount = 0; - unsigned SGPRUsed = AMDGPU::NoRegister; - for (int OpIdx : OpIndices) { - if (OpIdx == -1) - break; - const MachineOperand &MO = MI->getOperand(OpIdx); - if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { - if (MO.isReg()) { - if (MO.getReg() != SGPRUsed) - ++ConstantBusCount; - SGPRUsed = MO.getReg(); - } else { - ++ConstantBusCount; - } - } - } - if (ConstantBusCount > 1) { - ErrInfo = "VOP* instruction uses the constant bus more than once"; - return false; - } - } - - // Verify misc. restrictions on specific instructions. - if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || - Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { - const MachineOperand &Src0 = MI->getOperand(Src0Idx); - const MachineOperand &Src1 = MI->getOperand(Src1Idx); - const MachineOperand &Src2 = MI->getOperand(Src2Idx); - if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { - if (!compareMachineOp(Src0, Src1) && - !compareMachineOp(Src0, Src2)) { - ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; - return false; - } - } - } - - return true; -} - -unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return AMDGPU::INSTRUCTION_LIST_END; - case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; - case AMDGPU::COPY: return AMDGPU::COPY; - case AMDGPU::PHI: return AMDGPU::PHI; - case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; - case AMDGPU::S_MOV_B32: - return MI.getOperand(1).isReg() ? 
- AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; - case AMDGPU::S_ADD_I32: - case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; - case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; - case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; - case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; - case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; - case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; - case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; - case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; - case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; - case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; - case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; - case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; - case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; - case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; - case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; - case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; - case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; - case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; - case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; - case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; - case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; - case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; - case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; - case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; - case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; - case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; - case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; - case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; - case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; - case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; - case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; - case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; - case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; - case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; - case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; - case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; - } -} - -bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { - return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; -} - -const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, - unsigned OpNo) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - const MCInstrDesc &Desc = get(MI.getOpcode()); - if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || - Desc.OpInfo[OpNo].RegClass == -1) { - unsigned Reg = MI.getOperand(OpNo).getReg(); - - if (TargetRegisterInfo::isVirtualRegister(Reg)) - return MRI.getRegClass(Reg); - return RI.getPhysRegClass(Reg); - } - - unsigned RCID = Desc.OpInfo[OpNo].RegClass; - return RI.getRegClass(RCID); -} - -bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { - switch (MI.getOpcode()) { - case AMDGPU::COPY: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::PHI: - case AMDGPU::INSERT_SUBREG: - return RI.hasVGPRs(getOpRegClass(MI, 0)); - default: - return RI.hasVGPRs(getOpRegClass(MI, OpNo)); - } -} - -void 
SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { - MachineBasicBlock::iterator I = MI; - MachineBasicBlock *MBB = MI->getParent(); - MachineOperand &MO = MI->getOperand(OpIdx); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; - const TargetRegisterClass *RC = RI.getRegClass(RCID); - unsigned Opcode = AMDGPU::V_MOV_B32_e32; - if (MO.isReg()) - Opcode = AMDGPU::COPY; - else if (RI.isSGPRClass(RC)) - Opcode = AMDGPU::S_MOV_B32; - - - const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); - if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) - VRC = &AMDGPU::VReg_64RegClass; - else - VRC = &AMDGPU::VGPR_32RegClass; - - unsigned Reg = MRI.createVirtualRegister(VRC); - DebugLoc DL = MBB->findDebugLoc(I); - BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) - .addOperand(MO); - MO.ChangeToRegister(Reg, false); -} - -unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - MachineOperand &SuperReg, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) - const { - assert(SuperReg.isReg()); - - unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); - unsigned SubReg = MRI.createVirtualRegister(SubRC); - - // Just in case the super register is itself a sub-register, copy it to a new - // value so we don't need to worry about merging its subreg index with the - // SubIdx passed to this function. The register coalescer should be able to - // eliminate this extra copy. - MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) - .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) - .addReg(NewSuperReg, 0, SubIdx); - - return SubReg; -} - -MachineOperand SIInstrInfo::buildExtractSubRegOrImm( - MachineBasicBlock::iterator MII, - MachineRegisterInfo &MRI, - MachineOperand &Op, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) const { - if (Op.isImm()) { - // XXX - Is there a better way to do this? 
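- // e.g. (illustrative) Op.getImm() == 0x1122334455667788 yields - // sub0 = 0x55667788 and sub1 = 0x11223344 below.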
- if (SubIdx == AMDGPU::sub0) - return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); - if (SubIdx == AMDGPU::sub1) - return MachineOperand::CreateImm(Op.getImm() >> 32); - - llvm_unreachable("Unhandled register index for immediate"); - } - - unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, - SubIdx, SubRC); - return MachineOperand::CreateReg(SubReg, false); -} - -unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const { - MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned Dst = MRI.createVirtualRegister(RC); - - MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - LoDst) - .addImm(Op.getImm() & 0xFFFFFFFF); - MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - HiDst) - .addImm(Op.getImm() >> 32); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst) - .addReg(LoDst) - .addImm(AMDGPU::sub0) - .addReg(HiDst) - .addImm(AMDGPU::sub1); - - Worklist.push_back(Lo); - Worklist.push_back(Hi); - - return Dst; -} - -// Change the order of operands from (0, 1, 2) to (0, 2, 1) -void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { - assert(Inst->getNumExplicitOperands() == 3); - MachineOperand Op1 = Inst->getOperand(1); - Inst->RemoveOperand(1); - Inst->addOperand(Op1); -} - -bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, - const MachineOperand *MO) const { - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - const MCInstrDesc &InstDesc = get(MI->getOpcode()); - const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; - const TargetRegisterClass *DefinedRC = - OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; - if (!MO) - MO = &MI->getOperand(OpIdx); - - if (isVALU(InstDesc.Opcode) && - usesConstantBus(MRI, *MO, DefinedRC->getSize())) { - unsigned SGPRUsed = - MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - if (i == OpIdx) - continue; - const MachineOperand &Op = MI->getOperand(i); - if (Op.isReg() && Op.getReg() != SGPRUsed && - usesConstantBus(MRI, Op, getOpSize(*MI, i))) { - return false; - } - } - } - - if (MO->isReg()) { - assert(DefinedRC); - const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg()); - - // In order to be legal, the common sub-class must be equal to the - // class of the current operand. For example: - // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL - // - // s_sendmsg 0, s0 ; Operand defined as m0reg - // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL - - return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; - } - - - // Handle non-register types that are treated like immediates. - assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); - - if (!DefinedRC) { - // This operand expects an immediate.
- return true; - } - - return isImmOperandLegal(MI, OpIdx, *MO); -} - -void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); - int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src2); - - // Legalize VOP2 - if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { - // Legalize src0 - if (!isOperandLegal(MI, Src0Idx)) - legalizeOpWithMove(MI, Src0Idx); - - // Legalize src1 - if (isOperandLegal(MI, Src1Idx)) - return; - - // Usually src0 of VOP2 instructions allows more types of inputs - // than src1, so try to commute the instruction to decrease our - // chances of having to insert a MOV instruction to legalize src1. - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - // If we are successful in commuting, then we know MI is legal, so - // we are done. - return; - } - - legalizeOpWithMove(MI, Src1Idx); - return; - } - - // XXX - Do any VOP3 instructions read VCC? - // Legalize VOP3 - if (isVOP3(MI->getOpcode())) { - int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; - - // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); - - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; - if (Idx == -1) - break; - MachineOperand &MO = MI->getOperand(Idx); - - if (MO.isReg()) { - if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - continue; // VGPRs are legal - - assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction"); - - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { - SGPRReg = MO.getReg(); - // We can use one SGPR in each VOP3 instruction. - continue; - } - } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { - // If it is not a register and not a literal constant, then it must be - // an inline constant which is always legal. - continue; - } - // If we make it this far, then the operand is not legal and we must - // legalize it. - legalizeOpWithMove(MI, Idx); - } - } - - // Legalize REG_SEQUENCE and PHI - // The register class of the operands must be the same type as the register - // class of the output. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || - MI->getOpcode() == AMDGPU::PHI) { - const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; - for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { - if (!MI->getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) - continue; - const TargetRegisterClass *OpRC = - MRI.getRegClass(MI->getOperand(i).getReg()); - if (RI.hasVGPRs(OpRC)) { - VRC = OpRC; - } else { - SRC = OpRC; - } - } - - // If any of the operands are VGPR registers, then they must all be VGPRs; - // otherwise we will create illegal VGPR->SGPR copies when legalizing - // them. - if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) { - if (!VRC) { - assert(SRC); - VRC = RI.getEquivalentVGPRClass(SRC); - } - RC = VRC; - } else { - RC = SRC; - } - - // Update all the operands so they have the same type.
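- // For instance (illustrative): a PHI joining an SGPR input with a VGPR input - // gets a COPY into a fresh register of the common class for each input below, - // so no illegal VGPR->SGPR copy is ever created.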
- for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { - if (!MI->getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) - continue; - unsigned DstReg = MRI.createVirtualRegister(RC); - MachineBasicBlock *InsertBB; - MachineBasicBlock::iterator Insert; - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { - InsertBB = MI->getParent(); - Insert = MI; - } else { - // MI is a PHI instruction. - InsertBB = MI->getOperand(i + 1).getMBB(); - Insert = InsertBB->getFirstTerminator(); - } - BuildMI(*InsertBB, Insert, MI->getDebugLoc(), - get(AMDGPU::COPY), DstReg) - .addOperand(MI->getOperand(i)); - MI->getOperand(i).setReg(DstReg); - } - } - - // Legalize INSERT_SUBREG - // src0 must have the same register class as dst - if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) { - unsigned Dst = MI->getOperand(0).getReg(); - unsigned Src0 = MI->getOperand(1).getReg(); - const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); - const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); - if (DstRC != Src0RC) { - MachineBasicBlock &MBB = *MI->getParent(); - unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0) - .addReg(Src0); - MI->getOperand(1).setReg(NewSrc0); - } - return; - } - - // Legalize MUBUF* instructions - // FIXME: If we start using the non-addr64 instructions for compute, we - // may need to legalize them here. - int SRsrcIdx = - AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); - if (SRsrcIdx != -1) { - // We have an MUBUF instruction - MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx); - unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass; - if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), - RI.getRegClass(SRsrcRC))) { - // The operands are legal. - // FIXME: We may need to legalize operands besides srsrc. - return; - } - - MachineBasicBlock &MBB = *MI->getParent(); - // Extract the ptr from the resource descriptor.
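- // (Illustrative note: in a buffer resource descriptor the 48-bit base - // pointer occupies the low two dwords, which is why only sub0 and sub1 are - // extracted below.)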
- - // SRsrcPtrLo = srsrc:sub0 - unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass); - - // SRsrcPtrHi = srsrc:sub1 - unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass); - - // Create an empty resource descriptor - unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); - - // Zero64 = 0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), - Zero64) - .addImm(0); - - // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatLo) - .addImm(RsrcDataFormat & 0xFFFFFFFF); - - // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatHi) - .addImm(RsrcDataFormat >> 32); - - // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); - - MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); - unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - unsigned NewVAddrLo; - unsigned NewVAddrHi; - if (VAddr) { - // This is already an ADDR64 instruction so we need to add the pointer - // extracted from the resource descriptor to the current value of VAddr. - NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), - NewVAddrLo) - .addReg(SRsrcPtrLo) - .addReg(VAddr->getReg(), 0, AMDGPU::sub0) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine); - - // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), - NewVAddrHi) - .addReg(SRsrcPtrHi) - .addReg(VAddr->getReg(), 0, AMDGPU::sub1) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine) - .addReg(AMDGPU::VCC, RegState::Implicit); - - } else { - // This instruction is the _OFFSET variant, so we need to convert it to - // ADDR64. - MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); - MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); - MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); - - // Create the new instruction. - unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); - MachineInstr *Addr64 = - BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr.
- .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0); // tfe - - MI->removeFromParent(); - MI = Addr64; - - NewVAddrLo = SRsrcPtrLo; - NewVAddrHi = SRsrcPtrHi; - VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); - SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); - } - - // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewVAddr) - .addReg(NewVAddrLo) - .addImm(AMDGPU::sub0) - .addReg(NewVAddrHi) - .addImm(AMDGPU::sub1); - - - // Update the instruction to use NewVaddr - VAddr->setReg(NewVAddr); - // Update the instruction to use NewSRsrc - SRsrc->setReg(NewSRsrc); - } -} - -void SIInstrInfo::splitSMRD(MachineInstr *MI, - const TargetRegisterClass *HalfRC, - unsigned HalfImmOp, unsigned HalfSGPROp, - MachineInstr *&Lo, MachineInstr *&Hi) const { - - DebugLoc DL = MI->getDebugLoc(); - MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RegLo = MRI.createVirtualRegister(HalfRC); - unsigned RegHi = MRI.createVirtualRegister(HalfRC); - unsigned HalfSize = HalfRC->getSize(); - const MachineOperand *OffOp = - getNamedOperand(*MI, AMDGPU::OpName::offset); - const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); - - // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes - // on VI. - - bool IsKill = SBase->isKill(); - if (OffOp) { - bool isVI = - MBB->getParent()->getSubtarget().getGeneration() >= - AMDGPUSubtarget::VOLCANIC_ISLANDS; - unsigned OffScale = isVI ? 1 : 4; - // Handle the _IMM variant - unsigned LoOffset = OffOp->getImm() * OffScale; - unsigned HiOffset = LoOffset + HalfSize; - Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) - // Use addReg instead of addOperand - // to make sure kill flag is cleared. - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addImm(LoOffset / OffScale); - - if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { - unsigned OffsetSGPR = - MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) - .addImm(HiOffset); // The offset in register is in bytes. 
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } else { - Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addImm(HiOffset / OffScale); - } - } else { - // Handle the _SGPR variant - MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); - Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addOperand(*SOff); - unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) - .addOperand(*SOff) - .addImm(HalfSize); - Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } - - unsigned SubLo, SubHi; - switch (HalfSize) { - case 4: - SubLo = AMDGPU::sub0; - SubHi = AMDGPU::sub1; - break; - case 8: - SubLo = AMDGPU::sub0_sub1; - SubHi = AMDGPU::sub2_sub3; - break; - case 16: - SubLo = AMDGPU::sub0_sub1_sub2_sub3; - SubHi = AMDGPU::sub4_sub5_sub6_sub7; - break; - case 32: - SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; - SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; - break; - default: - llvm_unreachable("Unhandled HalfSize"); - } - - BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) - .addOperand(MI->getOperand(0)) - .addReg(RegLo) - .addImm(SubLo) - .addReg(RegHi) - .addImm(SubHi); -} - -void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { - MachineBasicBlock *MBB = MI->getParent(); - switch (MI->getOpcode()) { - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: { - unsigned NewOpcode = getVALUOp(*MI); - unsigned RegOffset; - unsigned ImmOffset; - - if (MI->getOperand(2).isReg()) { - RegOffset = MI->getOperand(2).getReg(); - ImmOffset = 0; - } else { - assert(MI->getOperand(2).isImm()); - // SMRD instructions take a dword offset on SI and a byte offset on VI, - // and MUBUF instructions always take a byte offset.
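- // Example (illustrative): an S_LOAD_DWORD_IMM offset of 4 on SI means - // 4 dwords, i.e. 16 bytes, so it is shifted left by 2 below; on VI the - // offset is already in bytes and is used unchanged.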
- ImmOffset = MI->getOperand(2).getImm(); - if (MBB->getParent()->getSubtarget().getGeneration() <= - AMDGPUSubtarget::SEA_ISLANDS) - ImmOffset <<= 2; - RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - - if (isUInt<12>(ImmOffset)) { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(0); - } else { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(ImmOffset); - ImmOffset = 0; - } - } - - unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - unsigned DWord0 = RegOffset; - unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); - - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) - .addImm(0); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) - .addImm(RsrcDataFormat & 0xFFFFFFFF); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) - .addImm(RsrcDataFormat >> 32); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) - .addReg(DWord0) - .addImm(AMDGPU::sub0) - .addReg(DWord1) - .addImm(AMDGPU::sub1) - .addReg(DWord2) - .addImm(AMDGPU::sub2) - .addReg(DWord3) - .addImm(AMDGPU::sub3); - MI->setDesc(get(NewOpcode)); - if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(SRsrc); - } else { - MI->getOperand(2).ChangeToRegister(SRsrc, false); - } - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe - - const TargetRegisterClass *NewDstRC = - RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); - - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); - break; - } - case AMDGPU::S_LOAD_DWORDX8_IMM: - case AMDGPU::S_LOAD_DWORDX8_SGPR: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, - AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); - break; - } - - case AMDGPU::S_LOAD_DWORDX16_IMM: - case AMDGPU::S_LOAD_DWORDX16_SGPR: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, - AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); - break; - } - } -} - -void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { - SmallVector<MachineInstr *, 128> Worklist; - Worklist.push_back(&TopInst); - - while (!Worklist.empty()) { - MachineInstr *Inst = Worklist.pop_back_val(); - MachineBasicBlock *MBB = Inst->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - - unsigned Opcode = Inst->getOpcode(); - unsigned NewOpcode = getVALUOp(*Inst); - - // Handle some special cases - switch (Opcode) { - default: - if (isSMRD(Inst->getOpcode())) { - moveSMRDToVALU(Inst, MRI); - } - break; - case AMDGPU::S_MOV_B64: { - DebugLoc DL = Inst->getDebugLoc(); - - // If the source operand is a register we can replace this with a - // copy.
- if (Inst->getOperand(1).isReg()) { - MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY)) - .addOperand(Inst->getOperand(0)) - .addOperand(Inst->getOperand(1)); - Worklist.push_back(Copy); - } else { - // Otherwise, we need to split this into two movs, because there is - // no 64-bit VALU move instruction. - unsigned Reg = Inst->getOperand(0).getReg(); - unsigned Dst = split64BitImm(Worklist, - Inst, - MRI, - MRI.getRegClass(Reg), - Inst->getOperand(1)); - MRI.replaceRegWith(Reg, Dst); - } - Inst->eraseFromParent(); - continue; - } - case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_BCNT1_I32_B64: - splitScalar64BitBCNT(Worklist, Inst); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_BFE_I64: { - splitScalar64BitBFE(Worklist, Inst); - Inst->eraseFromParent(); - continue; - } - - case AMDGPU::S_LSHL_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHLREV_B32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_ASHR_I32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_ASHRREV_I32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHR_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHRREV_B32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHL_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHLREV_B64; - swapOperands(Inst); - } - break; - case AMDGPU::S_ASHR_I64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_ASHRREV_I64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHR_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHRREV_B64; - swapOperands(Inst); - } - break; - - case AMDGPU::S_BFE_U64: - case AMDGPU::S_BFM_B64: - llvm_unreachable("Moving this op to VALU not implemented"); - } - - if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { - // We cannot move this instruction to the VALU, so we should try to - // legalize its operands instead. - legalizeOperands(Inst); - continue; - } - - // Use the new VALU Opcode. - const MCInstrDesc &NewDesc = get(NewOpcode); - Inst->setDesc(NewDesc); - - // Remove any references to SCC. Vector instructions can't read from it, and - // we're just about to add the implicit use / defs of VCC, and we don't want - // both. - for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { - MachineOperand &Op = Inst->getOperand(i); - if (Op.isReg() && Op.getReg() == AMDGPU::SCC) - Inst->RemoveOperand(i); - } - - if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { - // We are converting these to a BFE, so we need to add the missing - // operands for the size and offset. - unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ?
8 : 16; - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(Size)); - - } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { - // The VALU version adds the second operand to the result, so insert an - // extra 0 operand. - Inst->addOperand(MachineOperand::CreateImm(0)); - } - - addDescImplicitUseDef(NewDesc, Inst); - - if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { - const MachineOperand &OffsetWidthOp = Inst->getOperand(2); - // If we need to move this to VGPRs, we need to unpack the second operand - // back into the 2 separate ones for bit offset and width. - assert(OffsetWidthOp.isImm() && - "Scalar BFE is only implemented for constant width and offset"); - uint32_t Imm = OffsetWidthOp.getImm(); - - uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. - uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - Inst->RemoveOperand(2); // Remove old immediate. - Inst->addOperand(MachineOperand::CreateImm(Offset)); - Inst->addOperand(MachineOperand::CreateImm(BitWidth)); - } - - // Update the destination register class. - - const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - - switch (Opcode) { - // For target instructions, getOpRegClass just returns the virtual - // register class associated with the operand, so we need to find an - // equivalent VGPR register class in order to move the instruction to the - // VALU. - case AMDGPU::COPY: - case AMDGPU::PHI: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::INSERT_SUBREG: - if (RI.hasVGPRs(NewDstRC)) - continue; - NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); - if (!NewDstRC) - continue; - break; - default: - break; - } - - unsigned DstReg = Inst->getOperand(0).getReg(); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); - - // Legalize the operands - legalizeOperands(Inst); - - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), - E = MRI.use_end(); I != E; ++I) { - MachineInstr &UseMI = *I->getParent(); - if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); - } - } - } -} - -//===----------------------------------------------------------------------===// -// Indirect addressing callbacks -//===----------------------------------------------------------------------===// - -unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const { - assert(Channel == 0); - return RegIndex; -} - -const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::VGPR_32RegClass; -} - -void SIInstrInfo::splitScalar64BitUnaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - DebugLoc DL = Inst->getDebugLoc(); - - MachineBasicBlock::iterator MII = Inst; - - const MCInstrDesc &InstDesc = get(Opcode); - const TargetRegisterClass *Src0RC = Src0.isReg() ?
- MRI.getRegClass(Src0.getReg()) : - &AMDGPU::SGPR_32RegClass; - - const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); - - MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub0, Src0SubRC); - - const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); - - unsigned DestSub0 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0); - - MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub1, Src0SubRC); - - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1); - - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); - BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); - - MRI.replaceRegWith(Dest.getReg(), FullDestReg); - - // Try to legalize the operands in case we need to swap the order to keep it - // valid. - Worklist.push_back(LoHalf); - Worklist.push_back(HiHalf); -} - -void SIInstrInfo::splitScalar64BitBinaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - MachineOperand &Src1 = Inst->getOperand(2); - DebugLoc DL = Inst->getDebugLoc(); - - MachineBasicBlock::iterator MII = Inst; - - const MCInstrDesc &InstDesc = get(Opcode); - const TargetRegisterClass *Src0RC = Src0.isReg() ? - MRI.getRegClass(Src0.getReg()) : - &AMDGPU::SGPR_32RegClass; - - const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); - const TargetRegisterClass *Src1RC = Src1.isReg() ? - MRI.getRegClass(Src1.getReg()) : - &AMDGPU::SGPR_32RegClass; - - const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); - - MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub0, Src0SubRC); - MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, - AMDGPU::sub0, Src1SubRC); - - const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); - - unsigned DestSub0 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0) - .addOperand(SrcReg1Sub0); - - MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub1, Src0SubRC); - MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, - AMDGPU::sub1, Src1SubRC); - - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1) - .addOperand(SrcReg1Sub1); - - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); - BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); - - MRI.replaceRegWith(Dest.getReg(), FullDestReg); - - // Try to legalize the operands in case we need to swap the order to keep it - // valid.
- Worklist.push_back(LoHalf); - Worklist.push_back(HiHalf); -} - -void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst) const { - MachineBasicBlock &MBB = *Inst->getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst->getDebugLoc(); - - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src = Inst->getOperand(1); - - const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); - const TargetRegisterClass *SrcRC = Src.isReg() ? - MRI.getRegClass(Src.getReg()) : - &AMDGPU::SGPR_32RegClass; - - unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); - - MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, - AMDGPU::sub0, SrcSubRC); - MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, - AMDGPU::sub1, SrcSubRC); - - MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg) - .addOperand(SrcRegSub0) - .addImm(0); - - MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg) - .addOperand(SrcRegSub1) - .addReg(MidReg); - - MRI.replaceRegWith(Dest.getReg(), ResultReg); - - Worklist.push_back(First); - Worklist.push_back(Second); -} - -void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst) const { - MachineBasicBlock &MBB = *Inst->getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst->getDebugLoc(); - - MachineOperand &Dest = Inst->getOperand(0); - uint32_t Imm = Inst->getOperand(2).getImm(); - uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. - uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - - (void) Offset; - - // Only sext_inreg cases handled. - assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && - BitWidth <= 32 && - Offset == 0 && - "Not implemented"); - - if (BitWidth < 32) { - unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - - BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) - .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) - .addImm(0) - .addImm(BitWidth); - - BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) - .addImm(31) - .addReg(MidRegLo); - - BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) - .addReg(MidRegLo) - .addImm(AMDGPU::sub0) - .addReg(MidRegHi) - .addImm(AMDGPU::sub1); - - MRI.replaceRegWith(Dest.getReg(), ResultReg); - return; - } - - MachineOperand &Src = Inst->getOperand(1); - unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - - BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) - .addImm(31) - .addReg(Src.getReg(), 0, AMDGPU::sub0); - - BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) - .addReg(Src.getReg(), 0, AMDGPU::sub0) - .addImm(AMDGPU::sub0) - .addReg(TmpReg) - .addImm(AMDGPU::sub1); - - MRI.replaceRegWith(Dest.getReg(), ResultReg); -} - -void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, - MachineInstr *Inst) const { - // Add the implicit and explicit register definitions.
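- // e.g. (illustrative) rewriting S_ADD_I32 to V_ADD_I32_e32 picks up an - // implicit def of VCC (the carry-out) from the new opcode's descriptor.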
- if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); - } - } - - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } - } -} - -unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, - int OpIndices[3]) const { - const MCInstrDesc &Desc = get(MI->getOpcode()); - - // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = AMDGPU::NoRegister; - - // First we need to consider the instruction's operand requirements before - // legalizing. Some operands are required to be SGPRs, such as implicit uses - // of VCC, but we are still bound by the constant bus requirement to only use - // one. - // - // If the operand's class is an SGPR, we can never move it. - - for (const MachineOperand &MO : MI->implicit_operands()) { - // We only care about reads. - if (MO.isDef()) - continue; - - if (MO.getReg() == AMDGPU::VCC) - return AMDGPU::VCC; - - if (MO.getReg() == AMDGPU::FLAT_SCR) - return AMDGPU::FLAT_SCR; - } - - unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - - for (unsigned i = 0; i < 3; ++i) { - int Idx = OpIndices[i]; - if (Idx == -1) - break; - - const MachineOperand &MO = MI->getOperand(Idx); - if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) - SGPRReg = MO.getReg(); - - if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - UsedSGPRs[i] = MO.getReg(); - } - - if (SGPRReg != AMDGPU::NoRegister) - return SGPRReg; - - // We don't have a required SGPR operand, so we have a bit more freedom in - // selecting operands to move. - - // Try to select the most used SGPR. If an SGPR is equal to one of the - // others, we choose that. - // - // e.g. 
- // V_FMA_F32 v0, s0, s0, s0 -> No moves - // V_FMA_F32 v0, s0, s1, s0 -> Move s1 - - if (UsedSGPRs[0] != AMDGPU::NoRegister) { - if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) - SGPRReg = UsedSGPRs[0]; - } - - if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { - if (UsedSGPRs[1] == UsedSGPRs[2]) - SGPRReg = UsedSGPRs[1]; - } - - return SGPRReg; -} - -MachineInstrBuilder SIInstrInfo::buildIndirectWrite( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) - .addReg(IndirectBaseReg, RegState::Define) - .addOperand(I->getOperand(0)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0) - .addReg(ValueReg); -} - -MachineInstrBuilder SIInstrInfo::buildIndirectRead( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0); - -} - -void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const { - int End = getIndirectIndexEnd(MF); - int Begin = getIndirectIndexBegin(MF); - - if (End == -1) - return; - - - for (int Index = Begin; Index <= End; ++Index) - Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); -} - -MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, - unsigned OperandName) const { - int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); - if (Idx == -1) - return nullptr; - - return &MI.getOperand(Idx); -} - -uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { - uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; - if (ST.isAmdHsaOS()) - RsrcDataFormat |= (1ULL << 56); - - return RsrcDataFormat; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUIntrinsicInfo.cpp (nonexistent) @@ -1,77 +0,0 @@ -//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief AMDGPU Implementation of the IntrinsicInfo class. -// -//===-----------------------------------------------------------------------===// - -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" - -using namespace llvm; - -#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN -#include "AMDGPUGenIntrinsics.inc" -#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN - -AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() - : TargetIntrinsicInfo() {} - -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned numTys) const { - static const char *const names[] = { -#define GET_INTRINSIC_NAME_TABLE -#include "AMDGPUGenIntrinsics.inc" -#undef GET_INTRINSIC_NAME_TABLE - }; - - if (IntrID < Intrinsic::num_intrinsics) { - return std::string(); - } - assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && - "Invalid intrinsic ID"); - - std::string Result(names[IntrID - Intrinsic::num_intrinsics]); - return Result; -} - -unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, - unsigned Len) const { - if (!StringRef(Name, Len).startswith("llvm.")) - return 0; // All intrinsics start with 'llvm.' - -#define GET_FUNCTION_RECOGNIZER -#include "AMDGPUGenIntrinsics.inc" -#undef GET_FUNCTION_RECOGNIZER - AMDGPUIntrinsic::ID IntrinsicID = - (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; - IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); - - if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { - return IntrinsicID; - } - return 0; -} - -bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { -// Overload Table -#define GET_INTRINSIC_OVERLOAD_TABLE -#include "AMDGPUGenIntrinsics.inc" -#undef GET_INTRINSIC_OVERLOAD_TABLE -} - -Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, - Type **Tys, - unsigned numTys) const { - llvm_unreachable("Not implemented"); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp (nonexistent) @@ -1,679 +0,0 @@ -//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass turns all control flow pseudo instructions into native ones, -/// computing their address on the fly; it also sets STACK_SIZE info.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/Debug.h"
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "R600Defines.h"
-#include "R600InstrInfo.h"
-#include "R600MachineFunctionInfo.h"
-#include "R600RegisterInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "r600cf"
-
-namespace {
-
-struct CFStack {
-
-  enum StackItem {
-    ENTRY = 0,
-    SUB_ENTRY = 1,
-    FIRST_NON_WQM_PUSH = 2,
-    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
-  };
-
-  const AMDGPUSubtarget *ST;
-  std::vector<StackItem> BranchStack;
-  std::vector<StackItem> LoopStack;
-  unsigned MaxStackSize;
-  unsigned CurrentEntries;
-  unsigned CurrentSubEntries;
-
-  CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st),
-      // We need to reserve a stack entry for CALL_FS in vertex shaders.
-      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
-      CurrentEntries(0), CurrentSubEntries(0) { }
-
-  unsigned getLoopDepth();
-  bool branchStackContains(CFStack::StackItem);
-  bool requiresWorkAroundForInst(unsigned Opcode);
-  unsigned getSubEntrySize(CFStack::StackItem Item);
-  void updateMaxStackSize();
-  void pushBranch(unsigned Opcode, bool isWQM = false);
-  void pushLoop();
-  void popBranch();
-  void popLoop();
-};
-
-unsigned CFStack::getLoopDepth() {
-  return LoopStack.size();
-}
-
-bool CFStack::branchStackContains(CFStack::StackItem Item) {
-  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
-       E = BranchStack.end(); I != E; ++I) {
-    if (*I == Item)
-      return true;
-  }
-  return false;
-}
-
-bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
-  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
-      getLoopDepth() > 1)
-    return true;
-
-  if (!ST->hasCFAluBug())
-    return false;
-
-  switch(Opcode) {
-  default: return false;
-  case AMDGPU::CF_ALU_PUSH_BEFORE:
-  case AMDGPU::CF_ALU_ELSE_AFTER:
-  case AMDGPU::CF_ALU_BREAK:
-  case AMDGPU::CF_ALU_CONTINUE:
-    if (CurrentSubEntries == 0)
-      return false;
-    if (ST->getWavefrontSize() == 64) {
-      // We are being conservative here.  We only require this work-around if
-      // CurrentSubEntries > 3 &&
-      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
-      //
-      // We have to be conservative, because we don't know for certain that
-      // our stack allocation algorithm for Evergreen/NI is correct.  Applying
-      // this work-around when CurrentSubEntries > 3 allows us to over-allocate
-      // stack resources without any problems.
-      return CurrentSubEntries > 3;
-    } else {
-      assert(ST->getWavefrontSize() == 32);
-      // We are being conservative here.  We only require the work-around if
-      // CurrentSubEntries > 7 &&
-      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
-      // See the comment on the wavefront size == 64 case for why we are
-      // being conservative.
-      return CurrentSubEntries > 7;
-    }
-  }
-}
-
-unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
-  switch(Item) {
-  default:
-    return 0;
-  case CFStack::FIRST_NON_WQM_PUSH:
-    assert(!ST->hasCaymanISA());
-    if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
-      // +1 For the push operation.
-      // +2 Extra space required.
-      return 3;
-    } else {
-      // Some documentation says that this is not necessary on Evergreen,
-      // but experimentation has shown that we need to allocate 1 extra
-      // sub-entry for the first non-WQM push.
-      // +1 For the push operation.
-      // +1 Extra space required.
- return 2; - } - case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: - assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); - // +1 For the push operation. - // +1 Extra space required. - return 2; - case CFStack::SUB_ENTRY: - return 1; - } -} - -void CFStack::updateMaxStackSize() { - unsigned CurrentStackSize = CurrentEntries + - (RoundUpToAlignment(CurrentSubEntries, 4) / 4); - MaxStackSize = std::max(CurrentStackSize, MaxStackSize); -} - -void CFStack::pushBranch(unsigned Opcode, bool isWQM) { - CFStack::StackItem Item = CFStack::ENTRY; - switch(Opcode) { - case AMDGPU::CF_PUSH_EG: - case AMDGPU::CF_ALU_PUSH_BEFORE: - if (!isWQM) { - if (!ST->hasCaymanISA() && - !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) - Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI - // See comment in - // CFStack::getSubEntrySize() - else if (CurrentEntries > 0 && - ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && - !ST->hasCaymanISA() && - !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) - Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; - else - Item = CFStack::SUB_ENTRY; - } else - Item = CFStack::ENTRY; - break; - } - BranchStack.push_back(Item); - if (Item == CFStack::ENTRY) - CurrentEntries++; - else - CurrentSubEntries += getSubEntrySize(Item); - updateMaxStackSize(); -} - -void CFStack::pushLoop() { - LoopStack.push_back(CFStack::ENTRY); - CurrentEntries++; - updateMaxStackSize(); -} - -void CFStack::popBranch() { - CFStack::StackItem Top = BranchStack.back(); - if (Top == CFStack::ENTRY) - CurrentEntries--; - else - CurrentSubEntries-= getSubEntrySize(Top); - BranchStack.pop_back(); -} - -void CFStack::popLoop() { - CurrentEntries--; - LoopStack.pop_back(); -} - -class R600ControlFlowFinalizer : public MachineFunctionPass { - -private: - typedef std::pair > ClauseFile; - - enum ControlFlowInstruction { - CF_TC, - CF_VC, - CF_CALL_FS, - CF_WHILE_LOOP, - CF_END_LOOP, - CF_LOOP_BREAK, - CF_LOOP_CONTINUE, - CF_JUMP, - CF_ELSE, - CF_POP, - CF_END - }; - - static char ID; - const R600InstrInfo *TII; - const R600RegisterInfo *TRI; - unsigned MaxFetchInst; - const AMDGPUSubtarget *ST; - - bool IsTrivialInst(MachineInstr *MI) const { - switch (MI->getOpcode()) { - case AMDGPU::KILL: - case AMDGPU::RETURN: - return true; - default: - return false; - } - } - - const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { - unsigned Opcode = 0; - bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); - switch (CFI) { - case CF_TC: - Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; - break; - case CF_VC: - Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; - break; - case CF_CALL_FS: - Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; - break; - case CF_WHILE_LOOP: - Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; - break; - case CF_END_LOOP: - Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; - break; - case CF_LOOP_BREAK: - Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; - break; - case CF_LOOP_CONTINUE: - Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; - break; - case CF_JUMP: - Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; - break; - case CF_ELSE: - Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; - break; - case CF_POP: - Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; - break; - case CF_END: - if (ST->hasCaymanISA()) { - Opcode = AMDGPU::CF_END_CM; - break; - } - Opcode = isEg ? 
AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; - break; - } - assert (Opcode && "No opcode selected"); - return TII->get(Opcode); - } - - bool isCompatibleWithClause(const MachineInstr *MI, - std::set &DstRegs) const { - unsigned DstMI, SrcMI; - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { - const MachineOperand &MO = *I; - if (!MO.isReg()) - continue; - if (MO.isDef()) { - unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) - DstMI = Reg; - else - DstMI = TRI->getMatchingSuperReg(Reg, - TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); - } - if (MO.isUse()) { - unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) - SrcMI = Reg; - else - SrcMI = TRI->getMatchingSuperReg(Reg, - TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); - } - } - if ((DstRegs.find(SrcMI) == DstRegs.end())) { - DstRegs.insert(DstMI); - return true; - } else - return false; - } - - ClauseFile - MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) - const { - MachineBasicBlock::iterator ClauseHead = I; - std::vector ClauseContent; - unsigned AluInstCount = 0; - bool IsTex = TII->usesTextureCache(ClauseHead); - std::set DstRegs; - for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { - if (IsTrivialInst(I)) - continue; - if (AluInstCount >= MaxFetchInst) - break; - if ((IsTex && !TII->usesTextureCache(I)) || - (!IsTex && !TII->usesVertexCache(I))) - break; - if (!isCompatibleWithClause(I, DstRegs)) - break; - AluInstCount ++; - ClauseContent.push_back(I); - } - MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), - getHWInstrDesc(IsTex?CF_TC:CF_VC)) - .addImm(0) // ADDR - .addImm(AluInstCount - 1); // COUNT - return ClauseFile(MIb, std::move(ClauseContent)); - } - - void getLiteral(MachineInstr *MI, std::vector &Lits) const { - static const unsigned LiteralRegs[] = { - AMDGPU::ALU_LITERAL_X, - AMDGPU::ALU_LITERAL_Y, - AMDGPU::ALU_LITERAL_Z, - AMDGPU::ALU_LITERAL_W - }; - const SmallVector, 3 > Srcs = - TII->getSrcs(MI); - for (unsigned i = 0, e = Srcs.size(); i < e; ++i) { - if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X) - continue; - int64_t Imm = Srcs[i].second; - std::vector::iterator It = - std::find(Lits.begin(), Lits.end(), Imm); - if (It != Lits.end()) { - unsigned Index = It - Lits.begin(); - Srcs[i].first->setReg(LiteralRegs[Index]); - } else { - assert(Lits.size() < 4 && "Too many literals in Instruction Group"); - Srcs[i].first->setReg(LiteralRegs[Lits.size()]); - Lits.push_back(Imm); - } - } - } - - MachineBasicBlock::iterator insertLiterals( - MachineBasicBlock::iterator InsertPos, - const std::vector &Literals) const { - MachineBasicBlock *MBB = InsertPos->getParent(); - for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { - unsigned LiteralPair0 = Literals[i]; - unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; - InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), - TII->get(AMDGPU::LITERALS)) - .addImm(LiteralPair0) - .addImm(LiteralPair1); - } - return InsertPos; - } - - ClauseFile - MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) - const { - MachineBasicBlock::iterator ClauseHead = I; - std::vector ClauseContent; - I++; - for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { - if (IsTrivialInst(I)) { - ++I; - continue; - } - if (!I->isBundle() && !TII->isALUInstr(I->getOpcode())) - break; - std::vector Literals; - if (I->isBundle()) 
{ - MachineInstr *DeleteMI = I; - MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); - while (++BI != E && BI->isBundledWithPred()) { - BI->unbundleFromPred(); - for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = BI->getOperand(i); - if (MO.isReg() && MO.isInternalRead()) - MO.setIsInternalRead(false); - } - getLiteral(BI, Literals); - ClauseContent.push_back(BI); - } - I = BI; - DeleteMI->eraseFromParent(); - } else { - getLiteral(I, Literals); - ClauseContent.push_back(I); - I++; - } - for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { - unsigned literal0 = Literals[i]; - unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; - MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), - TII->get(AMDGPU::LITERALS)) - .addImm(literal0) - .addImm(literal2); - ClauseContent.push_back(MILit); - } - } - assert(ClauseContent.size() < 128 && "ALU clause is too big"); - ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); - return ClauseFile(ClauseHead, std::move(ClauseContent)); - } - - void - EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, - unsigned &CfCount) { - CounterPropagateAddr(Clause.first, CfCount); - MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) - .addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } - CfCount += 2 * Clause.second.size(); - } - - void - EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, - unsigned &CfCount) { - Clause.first->getOperand(0).setImm(0); - CounterPropagateAddr(Clause.first, CfCount); - MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) - .addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } - CfCount += Clause.second.size(); - } - - void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { - MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); - } - void CounterPropagateAddr(const std::set &MIs, - unsigned Addr) const { - for (MachineInstr *MI : MIs) { - CounterPropagateAddr(MI, Addr); - } - } - -public: - R600ControlFlowFinalizer(TargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} - - bool runOnMachineFunction(MachineFunction &MF) override { - ST = &MF.getSubtarget(); - MaxFetchInst = ST->getTexVTXClauseSize(); - TII = static_cast(ST->getInstrInfo()); - TRI = static_cast(ST->getRegisterInfo()); - R600MachineFunctionInfo *MFI = MF.getInfo(); - - CFStack CFStack(ST, MFI->getShaderType()); - for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; - ++MB) { - MachineBasicBlock &MBB = *MB; - unsigned CfCount = 0; - std::vector > > LoopStack; - std::vector IfThenElseStack; - if (MFI->getShaderType() == ShaderType::VERTEX) { - BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), - getHWInstrDesc(CF_CALL_FS)); - CfCount++; - } - std::vector FetchClauses, AluClauses; - std::vector LastAlu(1); - std::vector ToPopAfter; - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E;) { - if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { - DEBUG(dbgs() << CfCount << ":"; I->dump();); - FetchClauses.push_back(MakeFetchClause(MBB, I)); - CfCount++; - LastAlu.back() = nullptr; - continue; - } - - MachineBasicBlock::iterator MI = I; - if (MI->getOpcode() != AMDGPU::ENDIF) - 
LastAlu.back() = nullptr; - if (MI->getOpcode() == AMDGPU::CF_ALU) - LastAlu.back() = MI; - I++; - bool RequiresWorkAround = - CFStack.requiresWorkAroundForInst(MI->getOpcode()); - switch (MI->getOpcode()) { - case AMDGPU::CF_ALU_PUSH_BEFORE: - if (RequiresWorkAround) { - DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) - .addImm(CfCount + 1) - .addImm(1); - MI->setDesc(TII->get(AMDGPU::CF_ALU)); - CfCount++; - CFStack.pushBranch(AMDGPU::CF_PUSH_EG); - } else - CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); - - case AMDGPU::CF_ALU: - I = MI; - AluClauses.push_back(MakeALUClause(MBB, I)); - DEBUG(dbgs() << CfCount << ":"; MI->dump();); - CfCount++; - break; - case AMDGPU::WHILELOOP: { - CFStack.pushLoop(); - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_WHILE_LOOP)) - .addImm(1); - std::pair > Pair(CfCount, - std::set()); - Pair.second.insert(MIb); - LoopStack.push_back(std::move(Pair)); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::ENDLOOP: { - CFStack.popLoop(); - std::pair > Pair = - std::move(LoopStack.back()); - LoopStack.pop_back(); - CounterPropagateAddr(Pair.second, CfCount); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) - .addImm(Pair.first + 1); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::IF_PREDICATE_SET: { - LastAlu.push_back(nullptr); - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_JUMP)) - .addImm(0) - .addImm(0); - IfThenElseStack.push_back(MIb); - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::ELSE: { - MachineInstr * JumpInst = IfThenElseStack.back(); - IfThenElseStack.pop_back(); - CounterPropagateAddr(JumpInst, CfCount); - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_ELSE)) - .addImm(0) - .addImm(0); - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); - IfThenElseStack.push_back(MIb); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::ENDIF: { - CFStack.popBranch(); - if (LastAlu.back()) { - ToPopAfter.push_back(LastAlu.back()); - } else { - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_POP)) - .addImm(CfCount + 1) - .addImm(1); - (void)MIb; - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); - CfCount++; - } - - MachineInstr *IfOrElseInst = IfThenElseStack.back(); - IfThenElseStack.pop_back(); - CounterPropagateAddr(IfOrElseInst, CfCount); - IfOrElseInst->getOperand(1).setImm(1); - LastAlu.pop_back(); - MI->eraseFromParent(); - break; - } - case AMDGPU::BREAK: { - CfCount ++; - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_LOOP_BREAK)) - .addImm(0); - LoopStack.back().second.insert(MIb); - MI->eraseFromParent(); - break; - } - case AMDGPU::CONTINUE: { - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_LOOP_CONTINUE)) - .addImm(0); - LoopStack.back().second.insert(MIb); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::RETURN: { - BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); - CfCount++; - MI->eraseFromParent(); - if (CfCount % 2) { - BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); - CfCount++; - } - for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) - EmitFetchClause(I, FetchClauses[i], CfCount); - for (unsigned i = 0, e = AluClauses.size(); i < e; i++) - EmitALUClause(I, AluClauses[i], CfCount); 
- } - default: - if (TII->isExport(MI->getOpcode())) { - DEBUG(dbgs() << CfCount << ":"; MI->dump();); - CfCount++; - } - break; - } - } - for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { - MachineInstr *Alu = ToPopAfter[i]; - BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), - TII->get(AMDGPU::CF_ALU_POP_AFTER)) - .addImm(Alu->getOperand(0).getImm()) - .addImm(Alu->getOperand(1).getImm()) - .addImm(Alu->getOperand(2).getImm()) - .addImm(Alu->getOperand(3).getImm()) - .addImm(Alu->getOperand(4).getImm()) - .addImm(Alu->getOperand(5).getImm()) - .addImm(Alu->getOperand(6).getImm()) - .addImm(Alu->getOperand(7).getImm()) - .addImm(Alu->getOperand(8).getImm()); - Alu->eraseFromParent(); - } - MFI->StackSize = CFStack.MaxStackSize; - } - - return false; - } - - const char *getPassName() const override { - return "R600 Control Flow Finalizer Pass"; - } -}; - -char R600ControlFlowFinalizer::ID = 0; - -} // end anonymous namespace - - -llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { - return new R600ControlFlowFinalizer(TM); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h (nonexistent) @@ -1,45 +0,0 @@ -//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface to describe a layout of a stack frame on a AMDIL target -/// machine. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H -#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H - -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/Target/TargetFrameLowering.h" - -namespace llvm { - -/// \brief Information about the stack frame layout on the AMDGPU targets. -/// -/// It holds the direction of the stack growth, the known stack alignment on -/// entry to each function, and the offset to the locals area. -/// See TargetFrameInfo for more comments. -class AMDGPUFrameLowering : public TargetFrameLowering { -public: - AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1); - virtual ~AMDGPUFrameLowering(); - - /// \returns The number of 32-bit sub-registers that are used when storing - /// values to the stack. 
-  unsigned getStackWidth(const MachineFunction &MF) const;
-  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
-  const SpillSlot *
-  getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
-  void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
-  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
-  bool hasFP(const MachineFunction &MF) const override;
-};
-} // namespace llvm
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIFixSGPRLiveRanges.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/SIFixSGPRLiveRanges.cpp	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIFixSGPRLiveRanges.cpp	(nonexistent)
@@ -1,192 +0,0 @@
-//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// SALU instructions ignore control flow, so we need to modify the live ranges
-/// of the registers they define in some cases.
-///
-/// The main case we need to handle is when a def is used in one side of a
-/// branch and not the other.  For example:
-///
-/// %def
-/// IF
-///   ...
-///   ...
-/// ELSE
-///   %use
-///   ...
-/// ENDIF
-///
-/// Here we need the register allocator to avoid assigning any of the defs
-/// inside of the IF to the same register as %def.  In traditional live
-/// interval analysis %def is not live inside the IF branch; however, since
-/// SALU instructions inside of the IF will be executed even if the branch is
-/// not taken, there is a chance that one of the instructions will overwrite
-/// the value of %def, so the use in the ELSE will see the wrong value.
-///
-/// The strategy we use for solving this is to add an extra use after the ENDIF:
-///
-/// %def
-/// IF
-///   ...
-///   ...
-/// ELSE
-///   %use
-///   ...
-/// ENDIF
-/// %use
-///
-/// Adding this use will make the def live throughout the IF branch, which is
-/// what we want.
-
-#include "AMDGPU.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-fix-sgpr-live-ranges"
-
-namespace {
-
-class SIFixSGPRLiveRanges : public MachineFunctionPass {
-public:
-  static char ID;
-
-public:
-  SIFixSGPRLiveRanges() : MachineFunctionPass(ID) {
-    initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  const char *getPassName() const override {
-    return "SI Fix SGPR live ranges";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<LiveIntervals>();
-    AU.addRequired<MachinePostDominatorTree>();
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-
-} // End anonymous namespace.
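For reference, the fix-up described in the file comment above reduces to one step: materialize an artificial implicit use of the SGPR at the nearest common post-dominator of the branch's two successors. A minimal sketch follows, mirroring the BuildMI call the pass itself emits further down; it assumes the backend-local AMDGPU::SGPR_USE pseudo opcode is visible (e.g. via AMDGPU.h) and that NCD, Reg, and TII are set up as in runOnMachineFunction below. It is an illustration, not a drop-in replacement.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Target/TargetInstrInfo.h"

using namespace llvm;

// Sketch only: NCD is the nearest common post-dominator of the branch's two
// successors, Reg is the SGPR whose live range must be extended, and
// AMDGPU::SGPR_USE is the no-op pseudo the pass uses as an artificial reader.
static void insertArtificialUse(MachineBasicBlock *NCD,
                                const TargetInstrInfo *TII, unsigned Reg) {
  // An implicit use at the head of the post-dominating block keeps the
  // register allocator from reusing Reg inside either side of the branch.
  BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(),
          TII->get(AMDGPU::SGPR_USE))
      .addReg(Reg, RegState::Implicit);
}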
-
-INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE,
-                      "SI Fix SGPR Live Ranges", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
-INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE,
-                    "SI Fix SGPR Live Ranges", false, false)
-
-char SIFixSGPRLiveRanges::ID = 0;
-
-char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID;
-
-FunctionPass *llvm::createSIFixSGPRLiveRangesPass() {
-  return new SIFixSGPRLiveRanges();
-}
-
-bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
-      MF.getSubtarget().getRegisterInfo());
-  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
-  MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
-  std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges;
-
-  // First pass, collect all live intervals for SGPRs
-  for (const MachineBasicBlock &MBB : MF) {
-    for (const MachineInstr &MI : MBB) {
-      for (const MachineOperand &MO : MI.defs()) {
-        if (MO.isImplicit())
-          continue;
-        unsigned Def = MO.getReg();
-        if (TargetRegisterInfo::isVirtualRegister(Def)) {
-          if (TRI->isSGPRClass(MRI.getRegClass(Def)))
-            SGPRLiveRanges.push_back(
-                std::make_pair(Def, &LIS->getInterval(Def)));
-        } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) {
-          SGPRLiveRanges.push_back(
-              std::make_pair(Def, &LIS->getRegUnit(Def)));
-        }
-      }
-    }
-  }
-
-  // Second pass: fix the intervals
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; ++BI) {
-    MachineBasicBlock &MBB = *BI;
-    if (MBB.succ_size() < 2)
-      continue;
-
-    // We have structured control flow, so the number of successors should be
-    // two.
-    assert(MBB.succ_size() == 2);
-    MachineBasicBlock *SuccA = *MBB.succ_begin();
-    MachineBasicBlock *SuccB = *(++MBB.succ_begin());
-    MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB);
-
-    if (!NCD)
-      continue;
-
-    MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator();
-
-    if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) {
-      assert(NCD->succ_size() == 2);
-      // We want to make sure we insert the Use after the ENDIF, not after
-      // the ELSE.
-      NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(),
-                                            *(++NCD->succ_begin()));
-    }
-    assert(SuccA && SuccB);
-    for (std::pair<unsigned, LiveRange *> RegLR : SGPRLiveRanges) {
-      unsigned Reg = RegLR.first;
-      LiveRange *LR = RegLR.second;
-
-      // FIXME: We could be smarter here.  If the register is Live-In to
-      // one block, but the other doesn't have any SGPR defs, then there
-      // won't be a conflict.  Also, if the branch decision is based on
-      // a value in an SGPR, then there will be no conflict.
-      bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA);
-      bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB);
-
-      if ((!LiveInToA && !LiveInToB) ||
-          (LiveInToA && LiveInToB))
-        continue;
-
-      // This interval is live in to one successor, but not the other, so
-      // we need to update its range so it is live in to both.
-      DEBUG(dbgs() << "Possible SGPR conflict detected in " << *LR <<
-            " BB#" << SuccA->getNumber() << ", BB#" <<
-            SuccB->getNumber() <<
-            " with NCD = " << NCD->getNumber() << '\n');
-
-      // FIXME: Need to figure out how to update LiveRange here so this pass
-      // will be able to preserve LiveInterval analysis.
-      BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(),
-              TII->get(AMDGPU::SGPR_USE))
-          .addReg(Reg, RegState::Implicit);
-      DEBUG(NCD->getFirstNonPHI()->dump());
-    }
-  }
-
-  return false;
-}
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIShrinkInstructions.cpp	(nonexistent)
@@ -1,272 +0,0 @@
-//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// The pass tries to use the 32-bit encoding for instructions when possible.
-//===----------------------------------------------------------------------===//
-//
-
-#include "AMDGPU.h"
-#include "AMDGPUMCInstLower.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-
-#define DEBUG_TYPE "si-shrink-instructions"
-
-STATISTIC(NumInstructionsShrunk,
-          "Number of 64-bit instructions reduced to 32-bit.");
-STATISTIC(NumLiteralConstantsFolded,
-          "Number of literal constants folded into 32-bit instructions.");
-
-namespace llvm {
-  void initializeSIShrinkInstructionsPass(PassRegistry&);
-}
-
-using namespace llvm;
-
-namespace {
-
-class SIShrinkInstructions : public MachineFunctionPass {
-public:
-  static char ID;
-
-public:
-  SIShrinkInstructions() : MachineFunctionPass(ID) {
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  const char *getPassName() const override {
-    return "SI Shrink Instructions";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
-                      "SI Shrink Instructions", false, false)
-INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
-                    "SI Shrink Instructions", false, false)
-
-char SIShrinkInstructions::ID = 0;
-
-FunctionPass *llvm::createSIShrinkInstructionsPass() {
-  return new SIShrinkInstructions();
-}
-
-static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
-                   const MachineRegisterInfo &MRI) {
-  if (!MO->isReg())
-    return false;
-
-  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
-    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));
-
-  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
-}
-
-static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
-                      const SIRegisterInfo &TRI,
-                      const MachineRegisterInfo &MRI) {
-
-  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
-  // Can't shrink instruction with three operands.
-  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
-  //        a special case for it.  It can only be shrunk if the third operand
-  //        is vcc.  We should handle this the same way we handle vopc, by
-  //        adding a register allocation hint pre-regalloc and then do the
-  //        shrinking post-regalloc.
-  if (Src2)
-    return false;
-
-  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-  const MachineOperand *Src1Mod =
-      TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
-
-  if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
-    return false;
-
-  // We don't need to check src0, all input types are legal, so just make sure
-  // src0 isn't using any modifiers.
-  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
-    return false;
-
-  // Check output modifiers
-  if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
-    return false;
-
-  if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
-    return false;
-
-  return true;
-}
-
-/// \brief This function checks \p MI for operands defined by a move immediate
-/// instruction and then folds the literal constant into the instruction if it
-/// can.  This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
-/// and will only fold literal constants if we are still in SSA.
-static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
-                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
-
-  if (!MRI.isSSA())
-    return;
-
-  assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) ||
-         TII->isVOPC(MI.getOpcode()));
-
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
-  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
-  MachineOperand &Src0 = MI.getOperand(Src0Idx);
-
-  // Only one literal constant is allowed per instruction, so if src0 is a
-  // literal constant then we can't do any folding.
-  if (Src0.isImm() &&
-      TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
-    return;
-
-  // Literal constants and SGPRs can only be used in Src0, so if Src0 is an
-  // SGPR, we cannot commute the instruction, so we can't fold any literal
-  // constants.
-  if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI))
-    return;
-
-  // Try to fold Src0
-  if (Src0.isReg()) {
-    unsigned Reg = Src0.getReg();
-    MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
-    if (Def && Def->isMoveImmediate()) {
-      MachineOperand &MovSrc = Def->getOperand(1);
-      bool ConstantFolded = false;
-
-      if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
-        Src0.ChangeToImmediate(MovSrc.getImm());
-        ConstantFolded = true;
-      }
-      if (ConstantFolded) {
-        if (MRI.use_empty(Reg))
-          Def->eraseFromParent();
-        ++NumLiteralConstantsFolded;
-        return;
-      }
-    }
-  }
-
-  // We have failed to fold src0, so commute the instruction and try again.
-  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
-    foldImmediates(MI, TII, MRI, false);
-
-}
-
-bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
-  std::vector<unsigned> I1Defs;
-
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; ++BI) {
-
-    MachineBasicBlock &MBB = *BI;
-    MachineBasicBlock::iterator I, Next;
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
-      Next = std::next(I);
-      MachineInstr &MI = *I;
-
-      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
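-      //
-      // For example (values chosen for illustration): "s_mov_b32 s0,
-      // 0xffffff00" needs an extra 32-bit literal dword, while "s_movk_i32
-      // s0, 0xff00" sign-extends its inline 16-bit immediate to the same
-      // 0xffffff00, so rewriting the opcode below saves 4 bytes.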
-      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
-        const MachineOperand &Src = MI.getOperand(1);
-
-        if (Src.isImm()) {
-          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
-            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
-        }
-
-        continue;
-      }
-
-      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
-        continue;
-
-      if (!canShrink(MI, TII, TRI, MRI)) {
-        // Try commuting the instruction and see if that enables us to shrink
-        // it.
-        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
-            !canShrink(MI, TII, TRI, MRI))
-          continue;
-      }
-
-      // getVOPe32 could be -1 here if we started with an instruction that had
-      // a 32-bit encoding and then commuted it to an instruction that did not.
-      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
-        continue;
-
-      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
-
-      if (TII->isVOPC(Op32)) {
-        unsigned DstReg = MI.getOperand(0).getReg();
-        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
-          // VOPC instructions can only write to the VCC register.  We can't
-          // force them to use VCC here, because the register allocator has
-          // trouble with sequences like this, which cause the allocator to run
-          // out of registers if vreg0 and vreg1 belong to the VCCReg register
-          // class:
-          // vreg0 = VOPC;
-          // vreg1 = VOPC;
-          // S_AND_B64 vreg0, vreg1
-          //
-          // So, instead of forcing the instruction to write to VCC, we provide
-          // a hint to the register allocator to use VCC and then we will run
-          // this pass again after RA and shrink it if it outputs to VCC.
-          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
-          continue;
-        }
-        if (DstReg != AMDGPU::VCC)
-          continue;
-      }
-
-      // We can shrink this instruction
-      DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);
-
-      MachineInstrBuilder Inst32 =
-          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
-
-      // dst
-      Inst32.addOperand(MI.getOperand(0));
-
-      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
-
-      const MachineOperand *Src1 =
-          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
-      if (Src1)
-        Inst32.addOperand(*Src1);
-
-      ++NumInstructionsShrunk;
-      MI.eraseFromParent();
-
-      foldImmediates(*Inst32, TII, MRI);
-      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
-
-    }
-  }
-  return false;
-}
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/Processors.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/Processors.td	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/Processors.td	(nonexistent)
@@ -1,137 +0,0 @@
-//===-- Processors.td - R600 Processor definitions ------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-// -//===----------------------------------------------------------------------===// - -class Proc Features> -: Processor; - -//===----------------------------------------------------------------------===// -// R600 -//===----------------------------------------------------------------------===// -def : Proc<"", R600_VLIW5_Itin, - [FeatureR600, FeatureVertexCache]>; - -def : Proc<"r600", R600_VLIW5_Itin, - [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>; - -def : Proc<"r630", R600_VLIW5_Itin, - [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rs880", R600_VLIW5_Itin, - [FeatureR600, FeatureWavefrontSize16]>; - -def : Proc<"rv670", R600_VLIW5_Itin, - [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// R700 -//===----------------------------------------------------------------------===// - -def : Proc<"rv710", R600_VLIW5_Itin, - [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rv730", R600_VLIW5_Itin, - [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rv770", R600_VLIW5_Itin, - [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// Evergreen -//===----------------------------------------------------------------------===// - -def : Proc<"cedar", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32, - FeatureCFALUBug]>; - -def : Proc<"redwood", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64, - FeatureCFALUBug]>; - -def : Proc<"sumo", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>; - -def : Proc<"juniper", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; - -def : Proc<"cypress", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureFP64, FeatureVertexCache, - FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// Northern Islands -//===----------------------------------------------------------------------===// - -def : Proc<"barts", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; - -def : Proc<"turks", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; - -def : Proc<"caicos", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureCFALUBug]>; - -def : Proc<"cayman", R600_VLIW4_Itin, - [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>; - -//===----------------------------------------------------------------------===// -// Southern Islands -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"SI", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32] ->; - -def : ProcessorModel<"tahiti", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32] ->; - -def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -//===----------------------------------------------------------------------===// -// Sea Islands -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"bonaire", 
SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount32] ->; - -def : ProcessorModel<"kabini", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount16] ->; - -def : ProcessorModel<"kaveri", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount32] ->; - -def : ProcessorModel<"hawaii", SIFullSpeedModel, - [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32] ->; - -def : ProcessorModel<"mullins", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount16]>; - -//===----------------------------------------------------------------------===// -// Volcanic Islands -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"tonga", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug] ->; - -def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug] ->; - -def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineScheduler.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineScheduler.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600MachineScheduler.h (nonexistent) @@ -1,103 +0,0 @@ -//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 Machine Scheduler interface -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H -#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H - -#include "R600InstrInfo.h" -#include "llvm/ADT/PriorityQueue.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -namespace llvm { - -class R600SchedStrategy : public MachineSchedStrategy { - - const ScheduleDAGMILive *DAG; - const R600InstrInfo *TII; - const R600RegisterInfo *TRI; - MachineRegisterInfo *MRI; - - enum InstKind { - IDAlu, - IDFetch, - IDOther, - IDLast - }; - - enum AluKind { - AluAny, - AluT_X, - AluT_Y, - AluT_Z, - AluT_W, - AluT_XYZW, - AluPredX, - AluTrans, - AluDiscarded, // LLVM Instructions that are going to be eliminated - AluLast - }; - - std::vector Available[IDLast], Pending[IDLast]; - std::vector AvailableAlus[AluLast]; - std::vector PhysicalRegCopy; - - InstKind CurInstKind; - int CurEmitted; - InstKind NextInstKind; - - unsigned AluInstCount; - unsigned FetchInstCount; - - int InstKindLimit[IDLast]; - - int OccupedSlotsMask; - -public: - R600SchedStrategy() : - DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) { - } - - virtual ~R600SchedStrategy() {} - - void initialize(ScheduleDAGMI *dag) override; - SUnit *pickNode(bool &IsTopNode) override; - void schedNode(SUnit *SU, bool IsTopNode) override; - void releaseTopNode(SUnit *SU) override; - void releaseBottomNode(SUnit *SU) override; - -private: - std::vector InstructionsGroupCandidate; - bool VLIW5; - - int getInstKind(SUnit *SU); - bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const; - AluKind getAluKind(SUnit *SU) const; - void LoadAlu(); - unsigned AvailablesAluCount() const; - SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu); 
- void PrepareNextSlot(); - SUnit *PopInst(std::vector &Q, bool AnyALU); - - void AssignSlot(MachineInstr *MI, unsigned Slot); - SUnit* pickAlu(); - SUnit* pickOther(int QID); - void MoveUnits(std::vector &QSrc, std::vector &QDst); -}; - -} // namespace llvm - -#endif /* R600MACHINESCHEDULER_H_ */ Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600Instructions.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600Instructions.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600Instructions.td (nonexistent) @@ -1,1744 +0,0 @@ -//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// TableGen definitions for instructions which are available on R600 family -// GPUs. -// -//===----------------------------------------------------------------------===// - -include "R600Intrinsics.td" -include "R600InstrFormats.td" - -class InstR600ISA pattern> : - InstR600 { - - let Namespace = "AMDGPU"; -} - -def MEMxi : Operand { - let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index); - let PrintMethod = "printMemOperand"; -} - -def MEMrr : Operand { - let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index); -} - -// Operands for non-registers - -class InstFlag - : OperandWithDefaultOps { - let PrintMethod = PM; -} - -// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers -def SEL : OperandWithDefaultOps { - let PrintMethod = "printSel"; -} -def BANK_SWIZZLE : OperandWithDefaultOps { - let PrintMethod = "printBankSwizzle"; -} - -def LITERAL : InstFlag<"printLiteral">; - -def WRITE : InstFlag <"printWrite", 1>; -def OMOD : InstFlag <"printOMOD">; -def REL : InstFlag <"printRel">; -def CLAMP : InstFlag <"printClamp">; -def NEG : InstFlag <"printNeg">; -def ABS : InstFlag <"printAbs">; -def UEM : InstFlag <"printUpdateExecMask">; -def UP : InstFlag <"printUpdatePred">; - -// XXX: The r600g finalizer in Mesa expects last to be one in most cases. -// Once we start using the packetizer in this backend we should have this -// default to 0. -def LAST : InstFlag<"printLast", 1>; -def RSel : Operand { - let PrintMethod = "printRSel"; -} -def CT: Operand { - let PrintMethod = "printCT"; -} - -def FRAMEri : Operand { - let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index); -} - -def ADDRParam : ComplexPattern; -def ADDRDWord : ComplexPattern; -def ADDRVTX_READ : ComplexPattern; -def ADDRGA_CONST_OFFSET : ComplexPattern; -def ADDRGA_VAR_OFFSET : ComplexPattern; - - -def R600_Pred : PredicateOperand; - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - -// Class for instructions with only one source register. -// If you add new ins to this instruction, make sure they are listed before -// $literal, because the backend currently assumes that the last operand is -// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in -// R600Defines.h, R600InstrInfo::buildDefaultInstruction(), -// and R600InstrInfo::getOperandIdx(). 
-class R600_1OP inst, string opName, list pattern, - InstrItinClass itin = AnyALU> : - InstR600 <(outs R600_Reg32:$dst), - (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, - BANK_SWIZZLE:$bank_swizzle), - !strconcat(" ", opName, - "$clamp $last $dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " - "$pred_sel $bank_swizzle"), - pattern, - itin>, - R600ALU_Word0, - R600ALU_Word1_OP2 { - - let src1 = 0; - let src1_rel = 0; - let src1_neg = 0; - let src1_abs = 0; - let update_exec_mask = 0; - let update_pred = 0; - let HasNativeOperands = 1; - let Op1 = 1; - let ALUInst = 1; - let DisableEncoding = "$literal"; - let UseNamedOperandTable = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class R600_1OP_Helper inst, string opName, SDPatternOperator node, - InstrItinClass itin = AnyALU> : - R600_1OP ; - -// If you add or change the operands for R600_2OP instructions, you must -// also update the R600Op2OperandIndex::ROI enum in R600Defines.h, -// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx(). -class R600_2OP inst, string opName, list pattern, - InstrItinClass itin = AnyALU> : - InstR600 <(outs R600_Reg32:$dst), - (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, - OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, - R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, - BANK_SWIZZLE:$bank_swizzle), - !strconcat(" ", opName, - "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " - "$src1_neg$src1_abs$src1$src1_abs$src1_rel, " - "$pred_sel $bank_swizzle"), - pattern, - itin>, - R600ALU_Word0, - R600ALU_Word1_OP2 { - - let HasNativeOperands = 1; - let Op2 = 1; - let ALUInst = 1; - let DisableEncoding = "$literal"; - let UseNamedOperandTable = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class R600_2OP_Helper inst, string opName, SDPatternOperator node, - InstrItinClass itin = AnyALU> : - R600_2OP ; - -// If you add our change the operands for R600_3OP instructions, you must -// also update the R600Op3OperandIndex::ROI enum in R600Defines.h, -// R600InstrInfo::buildDefaultInstruction(), and -// R600InstrInfo::getOperandIdx(). 
-class R600_3OP inst, string opName, list pattern, - InstrItinClass itin = AnyALU> : - InstR600 <(outs R600_Reg32:$dst), - (ins REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, - R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, - R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, - BANK_SWIZZLE:$bank_swizzle), - !strconcat(" ", opName, "$clamp $last $dst$dst_rel, " - "$src0_neg$src0$src0_rel, " - "$src1_neg$src1$src1_rel, " - "$src2_neg$src2$src2_rel, " - "$pred_sel" - "$bank_swizzle"), - pattern, - itin>, - R600ALU_Word0, - R600ALU_Word1_OP3{ - - let HasNativeOperands = 1; - let DisableEncoding = "$literal"; - let Op3 = 1; - let UseNamedOperandTable = 1; - let ALUInst = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class R600_REDUCTION inst, dag ins, string asm, list pattern, - InstrItinClass itin = VecALU> : - InstR600 <(outs R600_Reg32:$dst), - ins, - asm, - pattern, - itin>; - - - -} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 - -def TEX_SHADOW : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return (TType >= 6 && TType <= 8) || TType == 13; - }] ->; - -def TEX_RECT : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 5; - }] ->; - -def TEX_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 9 || TType == 10 || TType == 16; - }] ->; - -def TEX_SHADOW_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 11 || TType == 12 || TType == 17; - }] ->; - -def TEX_MSAA : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 14; - }] ->; - -def TEX_ARRAY_MSAA : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 15; - }] ->; - -class EG_CF_RAT cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, - dag outs, dag ins, string asm, list pattern> : - InstR600ISA , - CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF { - - let rat_id = ratid; - let rat_inst = ratinst; - let rim = 0; - // XXX: Have a separate instruction for non-indexed writes. - let type = 1; - let rw_rel = 0; - let elem_size = 0; - - let array_size = 0; - let comp_mask = mask; - let burst_count = 0; - let vpm = 0; - let cf_inst = cfinst; - let mark = 0; - let barrier = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - let IsExport = 1; - -} - -class VTX_READ buffer_id, dag outs, list pattern> - : InstR600ISA , - VTX_WORD1_GPR { - - // Static fields - let DST_REL = 0; - // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL, - // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored, - // however, based on my testing if USE_CONST_FIELDS is set, then all - // these fields need to be set to 0. 
- let USE_CONST_FIELDS = 0; - let NUM_FORMAT_ALL = 1; - let FORMAT_COMP_ALL = 0; - let SRF_MODE_ALL = 0; - - let Inst{63-32} = Word1; - // LLVM can only encode 64-bit instructions, so these fields are manually - // encoded in R600CodeEmitter - // - // bits<16> OFFSET; - // bits<2> ENDIAN_SWAP = 0; - // bits<1> CONST_BUF_NO_STRIDE = 0; - // bits<1> MEGA_FETCH = 0; - // bits<1> ALT_CONST = 0; - // bits<2> BUFFER_INDEX_MODE = 0; - - // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding - // is done in R600CodeEmitter - // - // Inst{79-64} = OFFSET; - // Inst{81-80} = ENDIAN_SWAP; - // Inst{82} = CONST_BUF_NO_STRIDE; - // Inst{83} = MEGA_FETCH; - // Inst{84} = ALT_CONST; - // Inst{86-85} = BUFFER_INDEX_MODE; - // Inst{95-86} = 0; Reserved - - // VTX_WORD3 (Padding) - // - // Inst{127-96} = 0; - - let VTXInst = 1; -} - -class LoadParamFrag : PatFrag < - (ops node:$ptr), (load_type node:$ptr), - [{ return isConstantLoad(dyn_cast(N), 0); }] ->; - -def load_param : LoadParamFrag; -def load_param_exti8 : LoadParamFrag; -def load_param_exti16 : LoadParamFrag; - -def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; - -def isR600toCayman - : Predicate< - "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; - -//===----------------------------------------------------------------------===// -// R600 SDNodes -//===----------------------------------------------------------------------===// - -def INTERP_PAIR_XY : AMDGPUShaderInst < - (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1), - (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), - "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1", - []>; - -def INTERP_PAIR_ZW : AMDGPUShaderInst < - (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1), - (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), - "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1", - []>; - -def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", - SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, - [SDNPVariadic] ->; - -def DOT4 : SDNode<"AMDGPUISD::DOT4", - SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>, - SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>, - SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>, - [] ->; - -def COS_HW : SDNode<"AMDGPUISD::COS_HW", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> ->; - -def SIN_HW : SDNode<"AMDGPUISD::SIN_HW", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> ->; - -def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>; - -def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>; - -multiclass TexPattern TextureOp, Instruction inst, ValueType vt = v4f32> { -def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, - (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw), - (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz), - (i32 imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z), - (i32 imm:$DST_SEL_W), - (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID), - (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z), - (i32 imm:$COORD_TYPE_W)), - (inst R600_Reg128:$SRC_GPR, - imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw, - imm:$offsetx, imm:$offsety, imm:$offsetz, - imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z, - imm:$DST_SEL_W, - imm:$RESOURCE_ID, imm:$SAMPLER_ID, - imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z, - imm:$COORD_TYPE_W)>; -} - -//===----------------------------------------------------------------------===// -// Interpolation Instructions 
-//===----------------------------------------------------------------------===//
-
-def INTERP_VEC_LOAD : AMDGPUShaderInst <
-  (outs R600_Reg128:$dst),
-  (ins i32imm:$src0),
-  "INTERP_LOAD $src0 : $dst",
-  [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>;
-
-def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
-  let bank_swizzle = 5;
-}
-
-def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> {
-  let bank_swizzle = 5;
-}
-
-def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>;
-
-//===----------------------------------------------------------------------===//
-// Export Instructions
-//===----------------------------------------------------------------------===//
-
-def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
-
-def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType,
-  [SDNPHasChain, SDNPSideEffect]>;
-
-class ExportWord0 {
-  field bits<32> Word0;
-
-  bits<13> arraybase;
-  bits<2> type;
-  bits<7> gpr;
-  bits<2> elem_size;
-
-  let Word0{12-0} = arraybase;
-  let Word0{14-13} = type;
-  let Word0{21-15} = gpr;
-  let Word0{22} = 0; // RW_REL
-  let Word0{29-23} = 0; // INDEX_GPR
-  let Word0{31-30} = elem_size;
-}
-
-class ExportSwzWord1 {
-  field bits<32> Word1;
-
-  bits<3> sw_x;
-  bits<3> sw_y;
-  bits<3> sw_z;
-  bits<3> sw_w;
-  bits<1> eop;
-  bits<8> inst;
-
-  let Word1{2-0} = sw_x;
-  let Word1{5-3} = sw_y;
-  let Word1{8-6} = sw_z;
-  let Word1{11-9} = sw_w;
-}
-
-class ExportBufWord1 {
-  field bits<32> Word1;
-
-  bits<12> arraySize;
-  bits<4> compMask;
-  bits<1> eop;
-  bits<8> inst;
-
-  let Word1{11-0} = arraySize;
-  let Word1{15-12} = compMask;
-}
-
-multiclass ExportPattern <Instruction ExportInst, bits<8> cf_inst> {
-  def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg),
-    (ExportInst
-        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0),
-        0, 61, 0, 7, 7, 7, cf_inst, 0)
-  >;
-
-  def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg),
-    (ExportInst
-        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0),
-        0, 61, 7, 0, 7, 7, cf_inst, 0)
-  >;
-
-  def : Pat<(int_R600_store_dummy (i32 imm:$type)),
-    (ExportInst
-        (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0)
-  >;
-
-  def : Pat<(int_R600_store_dummy 1),
-    (ExportInst
-        (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0)
-  >;
-
-  def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
-    (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)),
-        (ExportInst R600_Reg128:$src, imm:$type, imm:$base,
-        imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0)
-  >;
-
-}
-
-multiclass SteamOutputExportPattern <Instruction ExportInst,
-                                     bits<8> buf0inst, bits<8> buf1inst,
-                                     bits<8> buf2inst, bits<8> buf3inst> {
-// Stream0
-  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
-      (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)),
-      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
-      4095, imm:$mask, buf0inst, 0)>;
-// Stream1
-  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
-      (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
-      (ExportInst $src, 0, imm:$arraybase,
-      4095, imm:$mask, buf1inst, 0)>;
-// Stream2
-  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
-      (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
-      (ExportInst $src, 0, imm:$arraybase,
-      4095, imm:$mask, buf2inst, 0)>;
-// Stream3
-  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
-      (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
-      (ExportInst $src, 0, imm:$arraybase,
-      4095, imm:$mask, buf3inst, 0)>;
-}
-
-// Export Instructions should not be duplicated by TailDuplication pass
-// (which assumes that duplicable instructions are affected by exec mask)
-let usesCustomInserter = 1, isNotDuplicable = 1 in {
-
-class ExportSwzInst : InstR600ISA<(outs),
-    (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
-    RSel:$sw_x, RSel:$sw_y, RSel:$sw_z, RSel:$sw_w, i32imm:$inst,
-    i32imm:$eop),
-    !strconcat("EXPORT", " $gpr.$sw_x$sw_y$sw_z$sw_w"),
-    []>, ExportWord0, ExportSwzWord1 {
-  let elem_size = 3;
-  let Inst{31-0} = Word0;
-  let Inst{63-32} = Word1;
-  let IsExport = 1;
-}
-
-} // End usesCustomInserter = 1
-
-class ExportBufInst : InstR600ISA<(outs),
-    (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
-    i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop),
-    !strconcat("EXPORT", " $gpr"),
-    []>, ExportWord0, ExportBufWord1 {
-  let elem_size = 0;
-  let Inst{31-0} = Word0;
-  let Inst{63-32} = Word1;
-  let IsExport = 1;
-}
-
-//===----------------------------------------------------------------------===//
-// Control Flow Instructions
-//===----------------------------------------------------------------------===//
-
-
-def KCACHE : InstFlag<"printKCache">;
-
-class ALU_CLAUSE <bits<4> inst, string OpName> : AMDGPUInst <(outs),
-(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1,
-KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1,
-i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1,
-i32imm:$COUNT, i32imm:$Enabled),
-!strconcat(OpName, " $COUNT, @$ADDR, "
-"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"),
-[] >, CF_ALU_WORD0, CF_ALU_WORD1 {
-  field bits<64> Inst;
-
-  let CF_INST = inst;
-  let ALT_CONST = 0;
-  let WHOLE_QUAD_MODE = 0;
-  let BARRIER = 1;
-  let isCodeGenOnly = 1;
-  let UseNamedOperandTable = 1;
-
-  let Inst{31-0} = Word0;
-  let Inst{63-32} = Word1;
-}
-
-class CF_WORD0_R600 {
-  field bits<32> Word0;
-
-  bits<32> ADDR;
-
-  let Word0 = ADDR;
-}
-
-class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
-ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 {
-  field bits<64> Inst;
-  bits<4> CNT;
-
-  let CF_INST = inst;
-  let BARRIER = 1;
-  let CF_CONST = 0;
-  let VALID_PIXEL_MODE = 0;
-  let COND = 0;
-  let COUNT = CNT{2-0};
-  let CALL_COUNT = 0;
-  let COUNT_3 = CNT{3};
-  let END_OF_PROGRAM = 0;
-  let WHOLE_QUAD_MODE = 0;
-
-  let Inst{31-0} = Word0;
-  let Inst{63-32} = Word1;
-}
-
-class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
-ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {
-  field bits<64> Inst;
-
-  let CF_INST = inst;
-  let BARRIER = 1;
-  let JUMPTABLE_SEL = 0;
-  let CF_CONST = 0;
-  let VALID_PIXEL_MODE = 0;
-  let COND = 0;
-  let END_OF_PROGRAM = 0;
-
-  let Inst{31-0} = Word0;
-  let Inst{63-32} = Word1;
-}
-
-def CF_ALU : ALU_CLAUSE<8, "ALU">;
-def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
-def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">;
-def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">;
-def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">;
-def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">;
-
-def FETCH_CLAUSE : AMDGPUInst <(outs),
-(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
-  field bits<8> Inst;
-  bits<8> num;
-  let Inst = num;
-  let isCodeGenOnly = 1;
-}
-
-def ALU_CLAUSE : AMDGPUInst <(outs),
-(ins i32imm:$addr), "ALU clause starting at $addr:", [] > {
-  field bits<8> Inst;
-  bits<8> num;
-  let Inst = num;
-  let isCodeGenOnly = 1;
-}
-
-def LITERALS : AMDGPUInst <(outs),
-(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > {
-  let isCodeGenOnly = 1;
-
-  field bits<64> Inst;
-  bits<32> literal1;
-  bits<32> literal2;
-
-  let Inst{31-0} = literal1;
-  let Inst{63-32} = literal2;
-}
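The CF_CLAUSE_R600 class above has to split its 4-bit clause count because the R600 control-flow word leaves no contiguous 4-bit field for it: COUNT carries CNT{2-0} and COUNT_3 carries CNT{3}. A minimal C++ sketch of that split and its reassembly; the helper names are invented here for illustration, and the actual word-1 bit positions live in CF_WORD1_R600, which is defined elsewhere in the target:

  #include <cstdint>

  // Mirrors "let COUNT = CNT{2-0}" and "let COUNT_3 = CNT{3}" above.
  static void splitClauseCount(uint32_t cnt, uint32_t &count, uint32_t &count3) {
    count  = cnt & 0x7;         // CNT{2-0}
    count3 = (cnt >> 3) & 0x1;  // CNT{3}
  }

  // Reassembles the 4-bit clause count from the two encoded fields.
  static uint32_t joinClauseCount(uint32_t count, uint32_t count3) {
    return (count3 << 3) | count;
  }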
-
-def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > {
-  field bits<64> Inst;
-}
-
-let Predicates = [isR600toCayman] in {
-
-//===----------------------------------------------------------------------===//
-// Common Instructions R600, R700, Evergreen, Cayman
-//===----------------------------------------------------------------------===//
-
-def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
-// Non-IEEE MUL: 0 * anything = 0
-def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>;
-def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
-// TODO: Do these actually match the regular fmin/fmax behavior?
-def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>;
-def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>;
-// According to https://msdn.microsoft.com/en-us/library/windows/desktop/cc308050%28v=vs.85%29.aspx
-// DX10 min/max returns the other operand if one is NaN,
-// this matches http://llvm.org/docs/LangRef.html#llvm-minnum-intrinsic
-def MAX_DX10 : R600_2OP_Helper <0x5, "MAX_DX10", fmaxnum>;
-def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>;
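The comment above pins down the NaN behavior that makes MAX_DX10/MIN_DX10 selectable as fmaxnum/fminnum: when exactly one operand is NaN, the other operand is returned. A small C++ sketch of the MIN_DX10 case, equivalent to std::fmin / llvm.minnum semantics:

  #include <cmath>

  // DX10-style minimum: a NaN input never wins; only min(NaN, NaN) is NaN.
  static float minDX10(float a, float b) {
    if (std::isnan(a)) return b;
    if (std::isnan(b)) return a;
    return a < b ? a : b;
  }

The plain MAX/MIN above use the legacy nodes instead, for which exactly this NaN guarantee is what the preceding TODO calls into question.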
-
-// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
-// so some of the instruction names don't match the asm string.
-// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
-def SETE : R600_2OP <
-  0x08, "SETE",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))]
->;
-
-def SGT : R600_2OP <
-  0x09, "SETGT",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))]
->;
-
-def SGE : R600_2OP <
-  0xA, "SETGE",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))]
->;
-
-def SNE : R600_2OP <
-  0xB, "SETNE",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))]
->;
-
-def SETE_DX10 : R600_2OP <
-  0xC, "SETE_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))]
->;
-
-def SETGT_DX10 : R600_2OP <
-  0xD, "SETGT_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))]
->;
-
-def SETGE_DX10 : R600_2OP <
-  0xE, "SETGE_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))]
->;
-
-// FIXME: This should probably be COND_ONE
-def SETNE_DX10 : R600_2OP <
-  0xF, "SETNE_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))]
->;
-
-def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
-def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>;
-def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
-def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
-def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
-
-def MOV : R600_1OP <0x19, "MOV", []>;
-
-let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
-
-class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
-  (outs R600_Reg32:$dst),
-  (ins immType:$imm),
-  "",
-  []
->;
-
-} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
-
-def MOV_IMM_I32 : MOV_IMM<i32, i32imm>;
-def : Pat <
-  (imm:$val),
-  (MOV_IMM_I32 imm:$val)
->;
-
-def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
-def : Pat <
-  (fpimm:$val),
-  (MOV_IMM_F32 fpimm:$val)
->;
-
-def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>;
-def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>;
-def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>;
-def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>;
-
-let hasSideEffects = 1 in {
-
-def KILLGT : R600_2OP <0x2D, "KILLGT", []>;
-
-} // end hasSideEffects
-
-def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>;
-def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>;
-def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>;
-def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>;
-def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>;
-def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>;
-def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", smax>;
-def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", smin>;
-def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", umax>;
-def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", umin>;
-
-def SETE_INT : R600_2OP <
-  0x3A, "SETE_INT",
-  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))]
->;
-
-def SETGT_INT : R600_2OP <
-  0x3B, "SETGT_INT",
-  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))]
->;
-
-def SETGE_INT : R600_2OP <
-  0x3C, "SETGE_INT",
-  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))]
->;
-
-def SETNE_INT : R600_2OP <
-  0x3D, "SETNE_INT",
-  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))]
->;
-
-def SETGT_UINT : R600_2OP <
-  0x3E, "SETGT_UINT",
-  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))]
->;
-
-def SETGE_UINT : R600_2OP <
-  0x3F, "SETGE_UINT",
-  [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))]
->;
-
-def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>;
-def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGT_INT", []>;
-def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>;
-def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>;
-
-def CNDE_INT : R600_3OP <
-  0x1C, "CNDE_INT",
-  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))]
->;
-
-def CNDGE_INT : R600_3OP <
-  0x1E, "CNDGE_INT",
-  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))]
->;
-
-def CNDGT_INT : R600_3OP <
-  0x1D, "CNDGT_INT",
-  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))]
->;
-
-//===----------------------------------------------------------------------===//
-// Texture instructions
-//===----------------------------------------------------------------------===//
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-
-class R600_TEX <bits<11> inst, string opName> :
-  InstR600 <(outs R600_Reg128:$DST_GPR),
-          (ins R600_Reg128:$SRC_GPR,
-               RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw,
-               i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz,
-               RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W,
-               i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID,
-               CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z,
-               CT:$COORD_TYPE_W),
-          !strconcat(opName,
-                     " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, "
-                     "$SRC_GPR.$srcx$srcy$srcz$srcw "
-                     "RID:$RESOURCE_ID SID:$SAMPLER_ID "
-                     "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"),
-          [],
-          NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 {
-  let Inst{31-0} = Word0;
-  let Inst{63-32} = Word1;
-
-  let TEX_INST = inst{4-0};
-  let SRC_REL = 0;
-  let DST_REL = 0;
-  let LOD_BIAS = 0;
-
-  let INST_MOD = 0;
-  let FETCH_WHOLE_QUAD = 0;
-  let ALT_CONST = 0;
-  let SAMPLER_INDEX_MODE = 0;
-  let RESOURCE_INDEX_MODE = 0;
-
-  let TEXInst = 1;
-}
-
-} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
-
-
-
-def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">;
-def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">;
-def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">;
-def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">;
-def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">;
-def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">;
-def TEX_LD : R600_TEX <0x03, "TEX_LD">;
-def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> {
-  let INST_MOD = 1;
-}
-def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">;
-def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">;
-def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">;
-def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">;
-def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">;
-def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">;
-def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">;
-
-defm : TexPattern<0, TEX_SAMPLE>;
-defm : TexPattern<1, TEX_SAMPLE_C>;
-defm : TexPattern<2, TEX_SAMPLE_L>;
-defm : TexPattern<3, TEX_SAMPLE_C_L>;
-defm : TexPattern<4, TEX_SAMPLE_LB>;
-defm : TexPattern<5, TEX_SAMPLE_C_LB>;
-defm : TexPattern<6, TEX_LD, v4i32>;
-defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>;
-defm : TexPattern<8, TEX_GET_GRADIENTS_H>;
-defm : TexPattern<9, TEX_GET_GRADIENTS_V>;
-defm : TexPattern<10, TEX_LDPTR, v4i32>;
-
-//===----------------------------------------------------------------------===//
-// Helper classes for common instructions
-//===----------------------------------------------------------------------===//
-
-class MUL_LIT_Common <bits<5> inst> : R600_3OP <
-  inst, "MUL_LIT",
-  []
->;
-
-class MULADD_Common <bits<5> inst> : R600_3OP <
-  inst, "MULADD",
-  []
->;
-
-class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
-  inst, "MULADD_IEEE",
-  [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))]
->;
-
-class FMA_Common <bits<5> inst> : R600_3OP <
-  inst, "FMA",
-  [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU
->;
-
-class CNDE_Common <bits<5> inst> : R600_3OP <
-  inst, "CNDE",
-  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))]
->;
-
-class CNDGT_Common <bits<5> inst> : R600_3OP <
-  inst, "CNDGT",
-  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))]
-> {
-  let Itinerary = VecALU;
-}
-
-class CNDGE_Common <bits<5> inst> : R600_3OP <
-  inst, "CNDGE",
-  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))]
-> {
-  let Itinerary = VecALU;
-}
-
-
-let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
-class R600_VEC2OP <list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
-// Slot X
-   UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
-   OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X,
-   R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X,
-   R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X,
-   R600_Pred:$pred_sel_X,
-// Slot Y
-   UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y,
-   OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y,
-   R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y,
-   R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y,
-   R600_Pred:$pred_sel_Y,
-// Slot Z
-   UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z,
-   OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z,
-   R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z,
-   R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z,
-   R600_Pred:$pred_sel_Z,
-// Slot W
-   UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W,
-   OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W,
-   R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W,
-   R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W,
-   R600_Pred:$pred_sel_W,
-   LITERAL:$literal0, LITERAL:$literal1),
-  "",
-  pattern,
-  AnyALU> {
-
-  let UseNamedOperandTable = 1;
-
-}
-}
-
-def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4
-  R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X,
-  R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y,
-  R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z,
-  R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>;
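The DOT_4 pseudo above bundles one multiply per channel slot (X, Y, Z, W) and reduces them into a single scalar result. In plain C++, the value the DOT4 node computes from its eight per-slot sources is simply a four-element dot product:

  // What the DOT4 node computes from its X/Y/Z/W slot pairs.
  static float dot4(const float src0[4], const float src1[4]) {
    return src0[0] * src1[0] + src0[1] * src1[1] +
           src0[2] * src1[2] + src0[3] * src1[3];
  }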
-
-class DOT4_Common <bits<11> inst> : R600_2OP <inst, "DOT4", []>;
-
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-multiclass CUBE_Common <bits<11> inst> {
-
-  def _pseudo : InstR600 <
-    (outs R600_Reg128:$dst),
-    (ins R600_Reg128:$src0),
-    "CUBE $dst $src0",
-    [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))],
-    VecALU
-  > {
-    let isPseudo = 1;
-    let UseNamedOperandTable = 1;
-  }
-
-  def _real : R600_2OP <inst, "CUBE", []>;
-}
-} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
-
-class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "EXP_IEEE", fexp2
-> {
-  let Itinerary = TransALU;
-}
-
-class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "FLT_TO_INT", fp_to_sint
-> {
-  let Itinerary = TransALU;
-}
-
-class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "INT_TO_FLT", sint_to_fp
-> {
-  let Itinerary = TransALU;
-}
-
-class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "FLT_TO_UINT", fp_to_uint
-> {
-  let Itinerary = TransALU;
-}
-
-class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "UINT_TO_FLT", uint_to_fp
-> {
-  let Itinerary = TransALU;
-}
-
-class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
-  inst, "LOG_CLAMPED", []
->;
-
-class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "LOG_IEEE", flog2
-> {
-  let Itinerary = TransALU;
-}
-
-class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>;
-class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>;
-class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
-class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
-  inst, "MULHI_INT", mulhs
-> {
-  let Itinerary = TransALU;
-}
-class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
-  inst, "MULHI", mulhu
-> {
-  let Itinerary = TransALU;
-}
-class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
-  inst, "MULLO_INT", mul
-> {
-  let Itinerary = TransALU;
-}
-class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> {
-  let Itinerary = TransALU;
-}
-
-class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
-  inst, "RECIP_CLAMPED", []
-> {
-  let Itinerary = TransALU;
-}
-
-class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
-  inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))]
-> {
-  let Itinerary = TransALU;
-}
-
-class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "RECIP_UINT", AMDGPUurecip
-> {
-  let Itinerary = TransALU;
-}
-
-// Clamped to maximum.
-class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped
-> {
-  let Itinerary = TransALU;
-}
-
-class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
-  inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy
-> {
-  let Itinerary = TransALU;
-}
-
-// TODO: There is also RECIPSQRT_FF which clamps to zero.
-
-class SIN_Common <bits<11> inst> : R600_1OP <
-  inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]> {
-  let Trig = 1;
-  let Itinerary = TransALU;
-}
-
-class COS_Common <bits<11> inst> : R600_1OP <
-  inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> {
-  let Trig = 1;
-  let Itinerary = TransALU;
-}
-
-def CLAMP_R600 : CLAMP <R600_Reg32>;
-def FABS_R600 : FABS <R600_Reg32>;
-def FNEG_R600 : FNEG <R600_Reg32>;
-
-//===----------------------------------------------------------------------===//
-// Helper patterns for complex intrinsics
-//===----------------------------------------------------------------------===//
-
-// FIXME: Should be predicated on unsafe fp math.
-multiclass DIV_Common <InstR600 recip_ieee> {
-def : Pat<
-  (int_AMDGPU_div f32:$src0, f32:$src1),
-  (MUL_IEEE $src0, (recip_ieee $src1))
->;
-
-def : Pat<
-  (fdiv f32:$src0, f32:$src1),
-  (MUL_IEEE $src0, (recip_ieee $src1))
->;
-
-def : RcpPat<recip_ieee, f32>;
-}
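DIV_Common above expands both int_AMDGPU_div and a generic fdiv into a multiply by the hardware reciprocal. Numerically that is a / b ≈ a * (1 / b): the reciprocal is not correctly rounded, so the product can differ from an IEEE divide in the last ULP, which is what the FIXME about an unsafe-fp-math predicate is pointing at. A one-line C++ sketch of the expansion:

  // a / b rewritten the way the DIV_Common patterns do it:
  // RECIP_IEEE followed by MUL_IEEE.
  static float divViaRecip(float a, float b) {
    float recip = 1.0f / b;  // stands in for RECIP_IEEE
    return a * recip;        // stands in for MUL_IEEE
  }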
-
-class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee>
-  : Pat <
-  (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w),
-  (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
->;
-
-//===----------------------------------------------------------------------===//
-// R600 / R700 Instructions
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isR600] in {
-
-  def MUL_LIT_r600 : MUL_LIT_Common<0x0C>;
-  def MULADD_r600 : MULADD_Common<0x10>;
-  def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>;
-  def CNDE_r600 : CNDE_Common<0x18>;
-  def CNDGT_r600 : CNDGT_Common<0x19>;
-  def CNDGE_r600 : CNDGE_Common<0x1A>;
-  def DOT4_r600 : DOT4_Common<0x50>;
-  defm CUBE_r600 : CUBE_Common<0x52>;
-  def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
-  def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
-  def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
-  def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>;
-  def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>;
-  def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>;
-  def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
-  def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>;
-  def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
-  def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>;
-  def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>;
-  def SIN_r600 : SIN_Common<0x6E>;
-  def COS_r600 : COS_Common<0x6F>;
-  def ASHR_r600 : ASHR_Common<0x70>;
-  def LSHR_r600 : LSHR_Common<0x71>;
-  def LSHL_r600 : LSHL_Common<0x72>;
-  def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
-  def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
-  def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
-  def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
-  def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;
-
-  defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
-  def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
-  def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
-
-  def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
-  def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
-
-  def R600_ExportSwz : ExportSwzInst {
-    let Word1{20-17} = 0; // BURST_COUNT
-    let Word1{21} = eop;
-    let Word1{22} = 0; // VALID_PIXEL_MODE
-    let Word1{30-23} = inst;
-    let Word1{31} = 1; // BARRIER
-  }
-  defm : ExportPattern<R600_ExportSwz, 39>;
-
-  def R600_ExportBuf : ExportBufInst {
-    let Word1{20-17} = 0; // BURST_COUNT
-    let Word1{21} = eop;
-    let Word1{22} = 0; // VALID_PIXEL_MODE
-    let Word1{30-23} = inst;
-    let Word1{31} = 1; // BARRIER
-  }
-  defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>;
-
-  def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$CNT),
-  "TEX $CNT @$ADDR"> {
-    let POP_COUNT = 0;
-  }
-  def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$CNT),
-  "VTX $CNT @$ADDR"> {
-    let POP_COUNT = 0;
-  }
-  def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR),
-  "LOOP_START_DX10 @$ADDR"> {
-    let POP_COUNT = 0;
-    let CNT = 0;
-  }
-  def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
-    let POP_COUNT = 0;
-    let CNT = 0;
-  }
-  def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR),
-  "LOOP_BREAK @$ADDR"> {
-    let POP_COUNT = 0;
-    let CNT = 0;
-  }
-  def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR),
-  "CONTINUE @$ADDR"> {
-    let POP_COUNT = 0;
-    let CNT = 0;
-  }
-  def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
-  "JUMP @$ADDR POP:$POP_COUNT"> {
-    let CNT = 0;
-  }
-  def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR),
-  "PUSH_ELSE @$ADDR"> {
-    let CNT = 0;
-    let POP_COUNT = 0; // FIXME?
-  }
-  def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
-  "ELSE @$ADDR POP:$POP_COUNT"> {
-    let CNT = 0;
-  }
-  def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> {
-    let ADDR = 0;
-    let CNT = 0;
-    let POP_COUNT = 0;
-  }
-  def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
-  "POP @$ADDR POP:$POP_COUNT"> {
-    let CNT = 0;
-  }
-  def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> {
-    let CNT = 0;
-    let POP_COUNT = 0;
-    let ADDR = 0;
-    let END_OF_PROGRAM = 1;
-  }
-
-}
-
-
-//===----------------------------------------------------------------------===//
-// Register loads and stores - for indirect addressing
-//===----------------------------------------------------------------------===//
-
-defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
-
-
-//===----------------------------------------------------------------------===//
-// Pseudo instructions
-//===----------------------------------------------------------------------===//
-
-let isPseudo = 1 in {
-
-def PRED_X : InstR600 <
-  (outs R600_Predicate_Bit:$dst),
-  (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
-  "", [], NullALU> {
-  let FlagOperandIdx = 3;
-}
-
-let isTerminator = 1, isBranch = 1 in {
-def JUMP_COND : InstR600 <
-  (outs),
-  (ins brtarget:$target, R600_Predicate_Bit:$p),
-  "JUMP $target ($p)",
-  [], AnyALU
->;
-
-def JUMP : InstR600 <
-  (outs),
-  (ins brtarget:$target),
-  "JUMP $target",
-  [], AnyALU
-> {
-  let isPredicable = 1;
-  let isBarrier = 1;
-}
-
-} // End isTerminator = 1, isBranch = 1
-
-let usesCustomInserter = 1 in {
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
-
-def MASK_WRITE : AMDGPUShaderInst <
-  (outs),
-  (ins R600_Reg32:$src),
-  "MASK_WRITE $src",
-  []
->;
-
-} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
-
-
-def TXD: InstR600 <
-  (outs R600_Reg128:$dst),
-  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
-       i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
-  "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
-  [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
-                     imm:$resourceId, imm:$samplerId, imm:$textureTarget))],
-  NullALU > {
-  let TEXInst = 1;
-}
-
-def TXD_SHADOW: InstR600 <
-  (outs R600_Reg128:$dst),
-  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
-       i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
-  "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
-  [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
-                     imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))],
-  NullALU
-> {
-  let TEXInst = 1;
-}
-} // End usesCustomInserter = 1
-} // End isPseudo = 1
-
-
-//===----------------------------------------------------------------------===//
-// Constant Buffer Addressing Support
-//===----------------------------------------------------------------------===//
-
-let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
-def CONST_COPY : Instruction {
-  let OutOperandList = (outs R600_Reg32:$dst);
-  let InOperandList = (ins i32imm:$src);
-  let Pattern =
-      [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
-  let AsmString = "CONST_COPY";
-  let hasSideEffects = 0;
-  let isAsCheapAsAMove = 1;
-  let Itinerary = NullALU;
-}
-} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
-
-def TEX_VTX_CONSTBUF :
-  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID),
-    "VTX_READ_eg $dst, $ptr",
-    [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>,
-  VTX_WORD1_GPR, VTX_WORD0_eg {
-
-  let VC_INST = 0;
-  let FETCH_TYPE = 2;
-  let FETCH_WHOLE_QUAD = 0;
-  let SRC_REL = 0;
-  let SRC_SEL_X = 0;
-  let DST_REL = 0;
-  let USE_CONST_FIELDS = 0;
-  let NUM_FORMAT_ALL = 2;
-  let FORMAT_COMP_ALL = 1;
-  let SRF_MODE_ALL = 1;
-  let MEGA_FETCH_COUNT = 16;
-  let DST_SEL_X = 0;
-  let DST_SEL_Y = 1;
-  let DST_SEL_Z = 2;
-  let DST_SEL_W = 3;
-  let DATA_FORMAT = 35;
-
-  let Inst{31-0} = Word0;
-  let Inst{63-32} = Word1;
-
-// LLVM can only encode 64-bit instructions, so these fields are manually
-// encoded in R600CodeEmitter
-//
-// bits<16> OFFSET;
-// bits<2>  ENDIAN_SWAP = 0;
-// bits<1>  CONST_BUF_NO_STRIDE = 0;
-// bits<1>  MEGA_FETCH = 0;
-// bits<1>  ALT_CONST = 0;
-// bits<2>  BUFFER_INDEX_MODE = 0;
-
-
-
-// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
-// is done in R600CodeEmitter)
-//
-// Inst{79-64} = OFFSET;
-// Inst{81-80} = ENDIAN_SWAP;
-// Inst{82}    = CONST_BUF_NO_STRIDE;
-// Inst{83}    = MEGA_FETCH;
-// Inst{84}    = ALT_CONST;
-// Inst{86-85} = BUFFER_INDEX_MODE;
-// Inst{95-86} = 0; Reserved
-
-// VTX_WORD3 (Padding)
-//
-// Inst{127-96} = 0;
-  let VTXInst = 1;
-}
-
-def TEX_VTX_TEXBUF:
-  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID),
-    "TEX_VTX_EXPLICIT_READ $dst, $ptr",
-    [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>,
-  VTX_WORD1_GPR, VTX_WORD0_eg {
-
-  let VC_INST = 0;
-  let FETCH_TYPE = 2;
-  let FETCH_WHOLE_QUAD = 0;
-  let SRC_REL = 0;
-  let SRC_SEL_X = 0;
-  let DST_REL = 0;
-  let USE_CONST_FIELDS = 1;
-  let NUM_FORMAT_ALL = 0;
-  let FORMAT_COMP_ALL = 0;
-  let SRF_MODE_ALL = 1;
-  let MEGA_FETCH_COUNT = 16;
-  let DST_SEL_X = 0;
-  let DST_SEL_Y = 1;
-  let DST_SEL_Z = 2;
-  let DST_SEL_W = 3;
-  let DATA_FORMAT = 0;
-
-  let Inst{31-0} = Word0;
-  let Inst{63-32} = Word1;
-
-// LLVM can only encode 64-bit instructions, so these fields are manually
-// encoded in R600CodeEmitter
-//
-// bits<16> OFFSET;
-// bits<2>  ENDIAN_SWAP = 0;
-// bits<1>  CONST_BUF_NO_STRIDE = 0;
-// bits<1>  MEGA_FETCH = 0;
-// bits<1>  ALT_CONST = 0;
-// bits<2>  BUFFER_INDEX_MODE = 0;
-
-
-
-// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
-// is done in R600CodeEmitter)
-//
-// Inst{79-64} = OFFSET;
-// Inst{81-80} = ENDIAN_SWAP;
-// Inst{82}    = CONST_BUF_NO_STRIDE;
-// Inst{83}    = MEGA_FETCH;
-// Inst{84}    = ALT_CONST;
-// Inst{86-85} = BUFFER_INDEX_MODE;
-// Inst{95-86} = 0; Reserved
-
-// VTX_WORD3 (Padding)
-//
-// Inst{127-96} = 0;
-  let VTXInst = 1;
-}
-
-//===---------------------------------------------------------------------===//
-// Flow and Program control Instructions
-//===---------------------------------------------------------------------===//
-class ILFormat <dag outs, dag ins, string asmstr, list<dag> pattern>
-: Instruction {
-
-  let Namespace = "AMDGPU";
-  dag OutOperandList = outs;
-  dag InOperandList = ins;
-  let Pattern = pattern;
-  let AsmString = !strconcat(asmstr, "\n");
-  let isPseudo = 1;
-  let Itinerary = NullALU;
-  bit hasIEEEFlag = 0;
-  bit hasZeroOpFlag = 0;
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let isCodeGenOnly = 1;
-}
-
-multiclass BranchConditional <SDNode Op, RegisterClass rci, RegisterClass rcf> {
-  def _i32 : ILFormat<(outs),
-      (ins brtarget:$target, rci:$src0),
-      "; i32 Pseudo branch instruction",
-      [(Op bb:$target, (i32 rci:$src0))]>;
-  def _f32 : ILFormat<(outs),
-      (ins brtarget:$target, rcf:$src0),
-      "; f32 Pseudo branch instruction",
-      [(Op bb:$target, (f32 rcf:$src0))]>;
-}
-
-// Only scalar types should generate flow control
-multiclass BranchInstr <string name> {
-  def _i32 : ILFormat<(outs), (ins R600_Reg32:$src),
-      !strconcat(name, " $src"), []>;
-  def _f32 : ILFormat<(outs), (ins R600_Reg32:$src),
-      !strconcat(name, " $src"), []>;
-}
-// Only scalar types should generate flow control
-multiclass BranchInstr2 <string name> {
-  def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
-      !strconcat(name, " $src0, $src1"), []>;
-  def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1),
-      !strconcat(name, " $src0, $src1"), []>;
-}
-
-//===---------------------------------------------------------------------===//
-// Custom Inserter for Branches and returns, this eventually will be a
-// separate pass
-//===---------------------------------------------------------------------===//
-let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
-  def BRANCH : ILFormat<(outs), (ins brtarget:$target),
-      "; Pseudo unconditional branch instruction",
-      [(br bb:$target)]>;
-  defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>;
-}
-
-//===---------------------------------------------------------------------===//
-// Return instruction
-//===---------------------------------------------------------------------===//
-let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
-    usesCustomInserter = 1 in {
-  def RETURN : ILFormat<(outs), (ins variable_ops),
-      "RETURN", [(IL_retflag)]>;
-}
-
-//===----------------------------------------------------------------------===//
-// Branch Instructions
-//===----------------------------------------------------------------------===//
-
-def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src),
-  "IF_PREDICATE_SET $src", []>;
-
-let isTerminator=1 in {
-  def BREAK : ILFormat<(outs), (ins),
-      "BREAK", []>;
-  def CONTINUE : ILFormat<(outs), (ins),
-      "CONTINUE", []>;
-  def DEFAULT : ILFormat<(outs), (ins),
-      "DEFAULT", []>;
-  def ELSE : ILFormat<(outs), (ins),
-      "ELSE", []>;
-  def ENDSWITCH : ILFormat<(outs), (ins),
-      "ENDSWITCH", []>;
-  def ENDMAIN : ILFormat<(outs), (ins),
-      "ENDMAIN", []>;
-  def END : ILFormat<(outs), (ins),
-      "END", []>;
-  def ENDFUNC : ILFormat<(outs), (ins),
-      "ENDFUNC", []>;
-  def ENDIF : ILFormat<(outs), (ins),
-      "ENDIF", []>;
-  def WHILELOOP : ILFormat<(outs), (ins),
-      "WHILE", []>;
-  def ENDLOOP : ILFormat<(outs), (ins),
-      "ENDLOOP", []>;
-  def FUNC : ILFormat<(outs), (ins),
-      "FUNC", []>;
-  def RETDYN : ILFormat<(outs), (ins),
-      "RET_DYN", []>;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">;
-  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
-  defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">;
-  defm IFC : BranchInstr2<"IFC">;
-  defm BREAKC : BranchInstr2<"BREAKC">;
-  defm CONTINUEC : BranchInstr2<"CONTINUEC">;
-}
-
-//===----------------------------------------------------------------------===//
-// Indirect addressing pseudo instructions
-//===----------------------------------------------------------------------===//
-
-let isPseudo = 1 in {
-
-class ExtractVertical <RegisterClass vec_rc> : InstR600 <
-  (outs R600_Reg32:$dst),
-  (ins vec_rc:$vec, R600_Reg32:$index), "",
-  [],
-  AnyALU
->;
-
-let Constraints = "$dst = $vec" in {
-
-class InsertVertical <RegisterClass vec_rc> : InstR600 <
-  (outs vec_rc:$dst),
-  (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "",
-  [],
-  AnyALU
->;
-
-} // End Constraints = "$dst = $vec"
-
-} // End isPseudo = 1
-
-def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>;
-def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>;
-
-def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>;
-def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>;
-
-class ExtractVerticalPat <Instruction inst, ValueType vec_ty,
-                          ValueType scalar_ty> : Pat <
-  (scalar_ty (extractelt vec_ty:$vec, i32:$index)),
-  (inst $vec, $index)
->;
-
-def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>;
-def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>;
-def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>;
-def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>;
-
-class InsertVerticalPat <Instruction inst, ValueType vec_ty,
-                         ValueType scalar_ty> : Pat <
-  (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)),
-  (inst $vec, $value, $index)
->;
-
-def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>;
-def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>;
-def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>;
-def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
-
-//===----------------------------------------------------------------------===//
-// ISel Patterns
-//===----------------------------------------------------------------------===//
-
-// CND*_INT Patterns for f32 True / False values
-
-class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat <
-  (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc),
-  (cnd $src0, $src1, $src2)
->;
-
-def : CND_INT_f32 <CNDE_INT, COND_EQ>;
-def : CND_INT_f32 <CNDGT_INT, COND_SGT>;
-def : CND_INT_f32 <CNDGE_INT, COND_SGE>;
-
-//CNDGE_INT extra pattern
-def : Pat <
-  (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT),
-  (CNDGE_INT $src0, $src1, $src2)
->;
-
-// KIL Patterns
-def KILP : Pat <
-  (int_AMDGPU_kilp),
-  (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
->;
-
-def KIL : Pat <
-  (int_AMDGPU_kill f32:$src0),
-  (MASK_WRITE (KILLGT (f32 ZERO), $src0))
->;
-
-def : Extract_Element <f32, v4f32, 0, sub0>;
-def : Extract_Element <f32, v4f32, 1, sub1>;
-def : Extract_Element <f32, v4f32, 2, sub2>;
-def : Extract_Element <f32, v4f32, 3, sub3>;
-
-def : Insert_Element <f32, v4f32, 0, sub0>;
-def : Insert_Element <f32, v4f32, 1, sub1>;
-def : Insert_Element <f32, v4f32, 2, sub2>;
-def : Insert_Element <f32, v4f32, 3, sub3>;
-
-def : Extract_Element <i32, v4i32, 0, sub0>;
-def : Extract_Element <i32, v4i32, 1, sub1>;
-def : Extract_Element <i32, v4i32, 2, sub2>;
-def : Extract_Element <i32, v4i32, 3, sub3>;
-
-def : Insert_Element <i32, v4i32, 0, sub0>;
-def : Insert_Element <i32, v4i32, 1, sub1>;
-def : Insert_Element <i32, v4i32, 2, sub2>;
-def : Insert_Element <i32, v4i32, 3, sub3>;
-
-def : Extract_Element <f32, v2f32, 0, sub0>;
-def : Extract_Element <f32, v2f32, 1, sub1>;
-
-def : Insert_Element <f32, v2f32, 0, sub0>;
-def : Insert_Element <f32, v2f32, 1, sub1>;
-
-def : Extract_Element <i32, v2i32, 0, sub0>;
-def : Extract_Element <i32, v2i32, 1, sub1>;
-
-def : Insert_Element <i32, v2i32, 0, sub0>;
-def : Insert_Element <i32, v2i32, 1, sub1>;
-
-// bitconvert patterns
-
-def : BitConvert <i32, f32, R600_Reg32>;
-def : BitConvert <f32, i32, R600_Reg32>;
-def : BitConvert <v2f32, v2i32, R600_Reg64>;
-def : BitConvert <v2i32, v2f32, R600_Reg64>;
-def : BitConvert <v4f32, v4i32, R600_Reg128>;
-def : BitConvert <v4i32, v4f32, R600_Reg128>;
-
-// DWORDADDR pattern
-def : DwordAddrPat <i32, R600_Reg32>;
-
-} // End isR600toCayman Predicate
-
-let Predicates = [isR600] in {
-// Intrinsic patterns
-defm : Expand24IBitOps <MULLO_INT_r600, ADD_INT>;
-defm : Expand24UBitOps <MULLO_UINT_r600, ADD_INT>;
-} // End isR600
-
-def getLDSNoRetOp : InstrMapping {
-  let FilterClass = "R600_LDS_1A1D";
-  let RowFields = ["BaseOp"];
-  let ColFields = ["DisableEncoding"];
-  let KeyCol = ["$dst"];
-  let ValueCols = [[""]];
-}
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp	(nonexistent)
@@ -1,25 +0,0 @@
-#include "AMDGPUMachineFunction.h"
-#include "AMDGPU.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"
-using namespace llvm;
-
-static const char *const ShaderTypeAttribute = "ShaderType";
-
-// Pin the vtable to this file.
-void AMDGPUMachineFunction::anchor() {}
-
-AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
-  MachineFunctionInfo(),
-  ShaderType(ShaderType::COMPUTE),
-  LDSSize(0),
-  ScratchSize(0),
-  IsKernel(true) {
-  Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute);
-
-  if (A.isStringAttribute()) {
-    StringRef Str = A.getValueAsString();
-    if (Str.getAsInteger(0, ShaderType))
-      llvm_unreachable("Can't parse shader type!");
-  }
-}
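One note on the attribute parse in the constructor above: StringRef::getAsInteger returns true on *failure*, which is why a nonzero result feeds llvm_unreachable, and radix 0 means the radix is auto-detected from the string prefix. A standalone C++ sketch of the same convention, using strtoul in place of the LLVM API (the function name here is invented for illustration):

  #include <cstdlib>
  #include <string>

  // Returns true on failure, mirroring StringRef::getAsInteger(0, Out).
  static bool parseShaderTypeAttr(const std::string &S, unsigned &Out) {
    char *End = nullptr;
    unsigned long V = std::strtoul(S.c_str(), &End, 0); // radix 0 = auto-detect
    if (S.empty() || *End != '\0')
      return true; // parse failure
    Out = static_cast<unsigned>(V);
    return false;
  }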
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R700Instructions.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/R700Instructions.td	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/R700Instructions.td	(nonexistent)
@@ -1,21 +0,0 @@
-//===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TableGen definitions for instructions which are:
-// - Available to R700 and newer VLIW4/VLIW5 GPUs
-// - Available only on R700 family GPUs.
-//
-//===----------------------------------------------------------------------===//
-
-def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">;
-
-let Predicates = [isR700] in {
-  def SIN_r700 : SIN_Common<0x6E>;
-  def COS_r700 : COS_Common<0x6F>;
-}
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SILowerI1Copies.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/SILowerI1Copies.cpp	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/SILowerI1Copies.cpp	(nonexistent)
@@ -1,151 +0,0 @@
-//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// i1 values are usually inserted by the CFG Structurize pass and they are
-/// unique in that they can be copied from VALU to SALU registers.
-/// This is not possible for any other value type. Since there are no
-/// MOV instructions for i1, we need to use V_CMP_* and V_CNDMASK to move
-/// the i1.
-///
-//===----------------------------------------------------------------------===//
-//
-
-#define DEBUG_TYPE "si-i1-copies"
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-
-namespace {
-
-class SILowerI1Copies : public MachineFunctionPass {
-public:
-  static char ID;
-
-public:
-  SILowerI1Copies() : MachineFunctionPass(ID) {
-    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  const char *getPassName() const override {
-    return "SI Lower i1 Copies";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineDominatorTree>();
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE,
-                      "SI Lower i1 Copies", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE,
-                    "SI Lower i1 Copies", false, false)
-
-char SILowerI1Copies::ID = 0;
-
-char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;
-
-FunctionPass *llvm::createSILowerI1CopiesPass() {
-  return new SILowerI1Copies();
-}
-
-bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  std::vector<unsigned> I1Defs;
-
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; ++BI) {
-
-    MachineBasicBlock &MBB = *BI;
-    MachineBasicBlock::iterator I, Next;
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
-      Next = std::next(I);
-      MachineInstr &MI = *I;
-
-      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
-        unsigned Reg = MI.getOperand(0).getReg();
-        const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-        if (RC == &AMDGPU::VReg_1RegClass)
-          MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
-        continue;
-      }
-
-      if (MI.getOpcode() != AMDGPU::COPY)
-        continue;
-
-      const MachineOperand &Dst = MI.getOperand(0);
-      const MachineOperand &Src = MI.getOperand(1);
-
-      if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
-          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
-        continue;
-
-      const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
-      const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
-
-      if (DstRC == &AMDGPU::VReg_1RegClass &&
-          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
-        I1Defs.push_back(Dst.getReg());
-        DebugLoc DL = MI.getDebugLoc();
-
-        MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
-        if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
-          if (DefInst->getOperand(1).isImm()) {
-            I1Defs.push_back(Dst.getReg());
-
-            int64_t Val = DefInst->getOperand(1).getImm();
-            assert(Val == 0 || Val == -1);
-
-            BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
-              .addOperand(Dst)
-              .addImm(Val);
-            MI.eraseFromParent();
-            continue;
-          }
-        }
-
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
-          .addOperand(Dst)
-          .addImm(0)
-          .addImm(-1)
-          .addOperand(Src);
-        MI.eraseFromParent();
-      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
-                 SrcRC == &AMDGPU::VReg_1RegClass) {
-        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
-          .addOperand(Dst)
-          .addOperand(Src)
-          .addImm(0);
-        MI.eraseFromParent();
-      }
-    }
-  }
-
-  for (unsigned Reg : I1Defs)
-    MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
-
-  return false;
-}
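A compact model of the two rewrites this pass performs, with the BuildMI plumbing stripped away. An i1 held in an SGPR pair is a 64-bit wavefront mask (one bit per lane), while an i1 held in a VGPR is 0 or -1 in each lane; the lane-level semantics of the emitted V_CNDMASK_B32 and V_CMP_NE_I32 are, as a sketch:

  #include <cstdint>

  // SALU -> VALU copy: V_CNDMASK_B32 dst, 0, -1, mask
  // picks -1 in lanes whose mask bit is set, 0 elsewhere.
  static int32_t i1CopySToV(uint64_t mask, unsigned lane) {
    return ((mask >> lane) & 1) ? -1 : 0;
  }

  // VALU -> SALU copy: V_CMP_NE_I32 dst, src, 0
  // sets a lane's mask bit where the vector value is nonzero.
  static uint64_t i1CopyVToS(const int32_t *src, unsigned numLanes) {
    uint64_t mask = 0;
    for (unsigned L = 0; L != numLanes; ++L)
      if (src[L] != 0)
        mask |= uint64_t(1) << L;
    return mask;
  }

The S_MOV_B64-of-immediate special case in the pass is the degenerate form of the first helper: a mask known to be all-zeros or all-ones becomes a direct V_MOV_B32 of 0 or -1.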
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h	(nonexistent)
@@ -1,281 +0,0 @@
-//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief AMDGPU specific subclass of TargetSubtarget.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H
-#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H
-#include "AMDGPU.h"
-#include "AMDGPUFrameLowering.h"
-#include "AMDGPUInstrInfo.h"
-#include "AMDGPUIntrinsicInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "R600ISelLowering.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
-
-#define GET_SUBTARGETINFO_HEADER
-#include "AMDGPUGenSubtargetInfo.inc"
-
-namespace llvm {
-
-class SIMachineFunctionInfo;
-
-class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
-
-public:
-  enum Generation {
-    R600 = 0,
-    R700,
-    EVERGREEN,
-    NORTHERN_ISLANDS,
-    SOUTHERN_ISLANDS,
-    SEA_ISLANDS,
-    VOLCANIC_ISLANDS,
-  };
-
-  enum {
-    FIXED_SGPR_COUNT_FOR_INIT_BUG = 80
-  };
-
-private:
-  std::string DevName;
-  bool Is64bit;
-  bool DumpCode;
-  bool R600ALUInst;
-  bool HasVertexCache;
-  short TexVTXClauseSize;
-  Generation Gen;
-  bool FP64;
-  bool FP64Denormals;
-  bool FP32Denormals;
-  bool FastFMAF32;
-  bool CaymanISA;
-  bool FlatAddressSpace;
-  bool EnableIRStructurizer;
-  bool EnablePromoteAlloca;
-  bool EnableIfCvt;
-  bool EnableLoadStoreOpt;
-  unsigned WavefrontSize;
-  bool CFALUBug;
-  int LocalMemorySize;
-  bool EnableVGPRSpilling;
-  bool SGPRInitBug;
-  bool IsGCN;
-  bool GCN1Encoding;
-  bool GCN3Encoding;
-  bool CIInsts;
-  bool FeatureDisable;
-  int LDSBankCount;
-
-  AMDGPUFrameLowering FrameLowering;
-  std::unique_ptr<AMDGPUTargetLowering> TLInfo;
-  std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
-  InstrItineraryData InstrItins;
-  Triple TargetTriple;
-
-public:
-  AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS, TargetMachine &TM);
-  AMDGPUSubtarget &initializeSubtargetDependencies(StringRef TT, StringRef GPU,
-                                                   StringRef FS);
-
-  const AMDGPUFrameLowering *getFrameLowering() const override {
-    return &FrameLowering;
-  }
-  const AMDGPUInstrInfo *getInstrInfo() const override {
-    return InstrInfo.get();
-  }
-  const AMDGPURegisterInfo *getRegisterInfo() const override {
-    return &InstrInfo->getRegisterInfo();
-  }
-  AMDGPUTargetLowering *getTargetLowering() const override {
-    return TLInfo.get();
-  }
-  const InstrItineraryData *getInstrItineraryData() const override {
-    return &InstrItins;
-  }
-
-  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
-
-  bool is64bit() const {
-    return Is64bit;
-  }
-
-  bool hasVertexCache() const {
-    return HasVertexCache;
-  }
-
-  short getTexVTXClauseSize() const {
-    return TexVTXClauseSize;
-  }
-
-  Generation getGeneration() const {
-    return Gen;
-  }
-
-  bool hasHWFP64() const {
-    return FP64;
-  }
-
-  bool hasCaymanISA() const {
-    return CaymanISA;
-  }
-
-  bool hasFP32Denormals() const {
-    return FP32Denormals;
-  }
-
-  bool hasFP64Denormals() const {
-    return FP64Denormals;
-  }
-
-  bool hasFastFMAF32() const {
-    return FastFMAF32;
-  }
-
-  bool hasFlatAddressSpace() const {
-    return FlatAddressSpace;
-  }
-
-  bool hasBFE() const {
-    return (getGeneration() >= EVERGREEN);
-  }
-
-  bool hasBFI() const {
-    return (getGeneration() >= EVERGREEN);
-  }
-
-  bool hasBFM() const {
-    return hasBFE();
-  }
-
-  bool hasBCNT(unsigned Size) const {
-    if (Size == 32)
-      return (getGeneration() >= EVERGREEN);
-
-    if (Size == 64)
-      return (getGeneration() >= SOUTHERN_ISLANDS);
-
-    return false;
-  }
-
-  bool hasMulU24() const {
-    return (getGeneration() >= EVERGREEN);
-  }
-
-  bool hasMulI24() const {
-    return (getGeneration() >= SOUTHERN_ISLANDS ||
-            hasCaymanISA());
-  }
-
-  bool hasFFBL() const {
-    return (getGeneration() >= EVERGREEN);
-  }
-
-  bool hasFFBH() const {
-    return (getGeneration() >= EVERGREEN);
-  }
-
-  bool hasCARRY() const {
-    return (getGeneration() >= EVERGREEN);
-  }
-
-  bool hasBORROW() const {
-    return (getGeneration() >= EVERGREEN);
-  }
-
-  bool IsIRStructurizerEnabled() const {
-    return EnableIRStructurizer;
-  }
-
-  bool isPromoteAllocaEnabled() const {
-    return EnablePromoteAlloca;
-  }
-
-  bool isIfCvtEnabled() const {
-    return EnableIfCvt;
-  }
-
-  bool loadStoreOptEnabled() const {
-    return EnableLoadStoreOpt;
-  }
-
-  unsigned getWavefrontSize() const {
-    return WavefrontSize;
-  }
-
-  unsigned getStackEntrySize() const;
-
-  bool hasCFAluBug() const {
-    assert(getGeneration() <= NORTHERN_ISLANDS);
-    return CFALUBug;
-  }
-
-  int getLocalMemorySize() const {
-    return LocalMemorySize;
-  }
-
-  bool hasSGPRInitBug() const {
-    return SGPRInitBug;
-  }
-
-  int getLDSBankCount() const {
-    return LDSBankCount;
-  }
-
-  unsigned getAmdKernelCodeChipID() const;
-
-  bool enableMachineScheduler() const override {
-    return true;
-  }
-
-  void overrideSchedPolicy(MachineSchedPolicy &Policy,
-                           MachineInstr *begin, MachineInstr *end,
-                           unsigned NumRegionInstrs) const override;
-
-  // Helper functions to simplify if statements
-  bool isTargetELF() const {
-    return false;
-  }
-
-  StringRef getDeviceName() const {
-    return DevName;
-  }
-
-  bool dumpCode() const {
-    return DumpCode;
-  }
-  bool r600ALUEncoding() const {
-    return R600ALUInst;
-  }
-  bool isAmdHsaOS() const {
-    return TargetTriple.getOS() == Triple::AMDHSA;
-  }
-  bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;
-
-  unsigned getMaxWavesPerCU() const {
-    if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
-      return 10;
-
-    // FIXME: Not sure what this is for other subtargets.
-    llvm_unreachable("do not know max waves per CU for this subtarget.");
-  }
-
-  bool enableSubRegLiveness() const override {
-    return true;
-  }
-};
-
-} // End namespace llvm
-
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/SIISelLowering.cpp	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIISelLowering.cpp	(nonexistent)
@@ -1,2241 +0,0 @@
-//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Custom DAG lowering for SI
-//
-//===----------------------------------------------------------------------===//
-
-#ifdef _MSC_VER
-// Provide M_PI.
-#define _USE_MATH_DEFINES
-#include <cmath>
-#endif
-
-#include "SIISelLowering.h"
-#include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/IR/Function.h"
-#include "llvm/ADT/SmallString.h"
-
-using namespace llvm;
-
-SITargetLowering::SITargetLowering(TargetMachine &TM,
-                                   const AMDGPUSubtarget &STI)
-    : AMDGPUTargetLowering(TM, STI) {
-  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
-  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
-
-  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
-  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
-
-  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
-  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
-
-  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
-  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
-  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
-
-  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
-  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
-
-  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
-  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
-
-  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
-  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
-
-  computeRegisterProperties(STI.getRegisterInfo());
-
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
-
-  setOperationAction(ISD::ADD, MVT::i32, Legal);
-  setOperationAction(ISD::ADDC, MVT::i32, Legal);
-  setOperationAction(ISD::ADDE, MVT::i32, Legal);
-  setOperationAction(ISD::SUBC, MVT::i32, Legal);
-  setOperationAction(ISD::SUBE, MVT::i32, Legal);
-
-  setOperationAction(ISD::FSIN, MVT::f32, Custom);
-  setOperationAction(ISD::FCOS, MVT::f32, Custom);
-
-  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
-  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
-
-  // We need to custom lower vector stores from local memory
-  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
-
-  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
-  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
-
-  setOperationAction(ISD::STORE, MVT::i1, Custom);
-  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
-
-  setOperationAction(ISD::SELECT, MVT::i64, Custom);
-  setOperationAction(ISD::SELECT, MVT::f64, Promote);
-  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
-
-  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
-  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
-  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
-  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
-
-  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
-  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
-
-  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
-
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
-
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
-
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
-
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
-
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
-
-  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
-  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
-
-  for (MVT VT : MVT::integer_valuetypes()) {
-    if (VT == MVT::i64)
-      continue;
-
-    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
-    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
-    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
-    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
-
-    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
-    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
-    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
-    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
-
-    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
-    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
-    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
-    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
-  }
-
-  for (MVT VT : MVT::integer_vector_valuetypes()) {
-    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand);
-    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand);
-  }
-
-  for (MVT VT : MVT::fp_valuetypes())
-    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
-
-  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
-  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
-  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
-
-  setOperationAction(ISD::LOAD, MVT::i1, Custom);
-
-  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
-  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
-
-  // These should use UDIVREM, so set them to expand
-  setOperationAction(ISD::UDIV, MVT::i64, Expand);
-  setOperationAction(ISD::UREM, MVT::i64, Expand);
-
-  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
-  setOperationAction(ISD::SELECT, MVT::i1, Promote);
-
-  // We only support LOAD/STORE and vector manipulation ops for vectors
-  // with > 4 elements.
-  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) {
-    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
-      switch(Op) {
-      case ISD::LOAD:
-      case ISD::STORE:
-      case ISD::BUILD_VECTOR:
-      case ISD::BITCAST:
-      case ISD::EXTRACT_VECTOR_ELT:
-      case ISD::INSERT_VECTOR_ELT:
-      case ISD::INSERT_SUBVECTOR:
-      case ISD::EXTRACT_SUBVECTOR:
-        break;
-      case ISD::CONCAT_VECTORS:
-        setOperationAction(Op, VT, Custom);
-        break;
-      default:
-        setOperationAction(Op, VT, Expand);
-        break;
-      }
-    }
-  }
-
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
-    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
-    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
-    setOperationAction(ISD::FRINT, MVT::f64, Legal);
-  }
-
-  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
-  setOperationAction(ISD::FDIV, MVT::f32, Custom);
-  setOperationAction(ISD::FDIV, MVT::f64, Custom);
-
-  setTargetDAGCombine(ISD::FADD);
-  setTargetDAGCombine(ISD::FSUB);
-  setTargetDAGCombine(ISD::FMINNUM);
-  setTargetDAGCombine(ISD::FMAXNUM);
-  setTargetDAGCombine(ISD::SMIN);
-  setTargetDAGCombine(ISD::SMAX);
-  setTargetDAGCombine(ISD::UMIN);
-  setTargetDAGCombine(ISD::UMAX);
-  setTargetDAGCombine(ISD::SELECT_CC);
-  setTargetDAGCombine(ISD::SETCC);
-  setTargetDAGCombine(ISD::AND);
-  setTargetDAGCombine(ISD::OR);
-  setTargetDAGCombine(ISD::UINT_TO_FP);
-
-  // All memory operations. Some folding on the pointer operand is done to help
-  // matching the constant offsets in the addressing modes.
-  setTargetDAGCombine(ISD::LOAD);
-  setTargetDAGCombine(ISD::STORE);
-  setTargetDAGCombine(ISD::ATOMIC_LOAD);
-  setTargetDAGCombine(ISD::ATOMIC_STORE);
-  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
-  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
-  setTargetDAGCombine(ISD::ATOMIC_SWAP);
-  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
-  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
-  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
-  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
-  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
-  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
-  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
-  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
-  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
-  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
-
-  setSchedulingPreference(Sched::RegPressure);
-}
-
-//===----------------------------------------------------------------------===//
-// TargetLowering queries
-//===----------------------------------------------------------------------===//
-
-bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
-                                          EVT) const {
-  // SI has some legal vector types, but no legal vector operations. Say no
-  // shuffles are legal in order to prefer scalarizing some vector operations.
-  return false;
-}
-
-bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                             Type *Ty, unsigned AS) const {
-  // No global is ever allowed as a base.
-  if (AM.BaseGV)
-    return false;
-
-  switch (AS) {
-  case AMDGPUAS::GLOBAL_ADDRESS:
-  case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
-  case AMDGPUAS::PRIVATE_ADDRESS:
-  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
-    // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
-    // additionally can do r + r + i with addr64. 32-bit has more addressing
-    // mode options. Depending on the resource constant, it can also do
-    // (i64 r0) + (i32 r1) * (i14 i).
- // - // SMRD instructions have an 8-bit, dword offset. - // - // Assume nonunifom access, since the address space isn't enough to know - // what instruction we will use, and since we don't know if this is a load - // or store and scalar stores are only available on VI. - // - // We also know if we are doing an extload, we can't do a scalar load. - // - // Private arrays end up using a scratch buffer most of the time, so also - // assume those use MUBUF instructions. Scratch loads / stores are currently - // implemented as mubuf instructions with offen bit set, so slightly - // different than the normal addr64. - if (!isUInt<12>(AM.BaseOffs)) - return false; - - // FIXME: Since we can split immediate into soffset and immediate offset, - // would it make sense to allow any immediate? - - switch (AM.Scale) { - case 0: // r + i or just i, depending on HasBaseReg. - return true; - case 1: - return true; // We have r + r or r + i. - case 2: - if (AM.HasBaseReg) { - // Reject 2 * r + r. - return false; - } - - // Allow 2 * r as r + r - // Or 2 * r + i is allowed as r + r + i. - return true; - default: // Don't allow n * r - return false; - } - } - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { - // Basic, single offset DS instructions allow a 16-bit unsigned immediate - // field. - // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have - // an 8-bit dword offset but we don't know the alignment here. - if (!isUInt<16>(AM.BaseOffs)) - return false; - - if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. - return true; - - if (AM.Scale == 1 && AM.HasBaseReg) - return true; - - return false; - } - case AMDGPUAS::FLAT_ADDRESS: { - // Flat instructions do not have offsets, and only have the register - // address. - return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); - } - default: - llvm_unreachable("unhandled address space"); - } -} - -bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned AddrSpace, - unsigned Align, - bool *IsFast) const { - if (IsFast) - *IsFast = false; - - // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, - // which isn't a simple VT. - if (!VT.isSimple() || VT == MVT::Other) - return false; - - // TODO - CI+ supports unaligned memory accesses, but this requires driver - // support. - - // XXX - The only mention I see of this in the ISA manual is for LDS direct - // reads the "byte address and must be dword aligned". Is it also true for the - // normal loads and stores? - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { - // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte - // aligned, 8 byte access in a single operation using ds_read2/write2_b32 - // with adjacent offsets. - return Align % 4 == 0; - } - - // Smaller than dword value must be aligned. - // FIXME: This should be allowed on CI+ - if (VT.bitsLT(MVT::i32)) - return false; - - // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the - // byte-address are ignored, thus forcing Dword alignment. - // This applies to private, global, and constant memory. - if (IsFast) - *IsFast = true; - - return VT.bitsGT(MVT::i32) && Align % 4 == 0; -} - -EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const { - // FIXME: Should account for address space here. - - // The default fallback uses the private pointer size as a guess for a type to - // use. 
Make sure we switch these to 64-bit accesses. - - if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global - return MVT::v4i32; - - if (Size >= 8 && DstAlign >= 4) - return MVT::v2i32; - - // Use the default. - return MVT::Other; -} - -TargetLoweringBase::LegalizeTypeAction -SITargetLowering::getPreferredVectorAction(EVT VT) const { - if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) - return TypeSplitVector; - - return TargetLoweringBase::getPreferredVectorAction(VT); -} - -bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - return TII->isInlineConstant(Imm); -} - -static EVT toIntegerVT(EVT VT) { - if (VT.isVector()) - return VT.changeVectorElementTypeToInteger(); - return MVT::getIntegerVT(VT.getSizeInBits()); -} - -SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, - SDLoc SL, SDValue Chain, - unsigned Offset, bool Signed) const { - const DataLayout *DL = getDataLayout(); - MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); - SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, - MRI.getLiveInVirtReg(InputPtrReg), PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Offset, SL, PtrVT)); - SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - - unsigned Align = DL->getABITypeAlignment(Ty); - - if (VT != MemVT && VT.isFloatingPoint()) { - // Do an integer load and convert. - // FIXME: This is mostly because load legalization after type legalization - // doesn't handle FP extloads. - assert(VT.getScalarType() == MVT::f32 && - MemVT.getScalarType() == MVT::f16); - - EVT IVT = toIntegerVT(VT); - EVT MemIVT = toIntegerVT(MemVT); - SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, - IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment - return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load); - } - - ISD::LoadExtType ExtTy = Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; - return DAG.getLoad(ISD::UNINDEXED, ExtTy, - VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment -} - -SDValue SITargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - - MachineFunction &MF = DAG.getMachineFunction(); - FunctionType *FType = MF.getFunction()->getFunctionType(); - SIMachineFunctionInfo *Info = MF.getInfo(); - - assert(CallConv == CallingConv::C); - - SmallVector Splits; - BitVector Skipped(Ins.size()); - - for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { - const ISD::InputArg &Arg = Ins[i]; - - // First, check if it's a PS input addr - if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal()) { - - assert((PSInputNum <= 15) && "Too many PS inputs!"); - - if (!Arg.Used) { - // We can safely skip PS inputs - Skipped.set(i); - ++PSInputNum; - continue; - } - - Info->PSInputAddr |= 1 << PSInputNum++; - } - - // Second, split vertices into their elements - if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; - NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); - - // We REALLY want the ORIGINAL number of vertex elements here, e.g. a - // three or five element vertex only needs three or five registers, - // NOT four or eight. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - for (unsigned j = 0; j != NumElements; ++j) { - Splits.push_back(NewArg); - NewArg.PartOffset += NewArg.VT.getStoreSize(); - } - - } else if (Info->getShaderType() != ShaderType::COMPUTE) { - Splits.push_back(Arg); - } - } - - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); - - // At least one interpolation mode must be enabled or else the GPU will hang. - if (Info->getShaderType() == ShaderType::PIXEL && - (Info->PSInputAddr & 0x7F) == 0) { - Info->PSInputAddr |= 1; - CCInfo.AllocateReg(AMDGPU::VGPR0); - CCInfo.AllocateReg(AMDGPU::VGPR1); - } - - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 - if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
- else - Info->NumUserSGPRs = 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); - } - - if (Info->getShaderType() == ShaderType::COMPUTE) { - getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, - Splits); - } - - AnalyzeFormalArguments(CCInfo, Splits); - - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { - - const ISD::InputArg &Arg = Ins[i]; - if (Skipped[i]) { - InVals.push_back(DAG.getUNDEF(Arg.VT)); - continue; - } - - CCValAssign &VA = ArgLocs[ArgIdx++]; - MVT VT = VA.getLocVT(); - - if (VA.isMemLoc()) { - VT = Ins[i].VT; - EVT MemVT = Splits[i].VT; - const unsigned Offset = 36 + VA.getLocMemOffset(); - // The first 36 bytes of the input buffer contains information about - // thread group and global sizes. - SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), - Offset, Ins[i].Flags.isSExt()); - - const PointerType *ParamTy = - dyn_cast(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && - ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - // On SI local pointers are just offsets into LDS, so they are always - // less than 16-bits. On CI and newer they could potentially be - // real pointers, so we can't guarantee their size. 
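(Editorial sketch, not part of the original file: the AssertZext emitted just below is how this 16-bit guarantee gets recorded in the DAG, so later combines may narrow uses of the pointer. A minimal form of the pattern, with a hypothetical helper name:)

static SDValue markLow16BitsOnly(SelectionDAG &DAG, SDLoc DL, SDValue V) {
  // Record that only the low 16 bits of V can be nonzero.
  return DAG.getNode(ISD::AssertZext, DL, V.getValueType(), V,
                     DAG.getValueType(MVT::i16));
}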
- Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, - DAG.getValueType(MVT::i16)); - } - - InVals.push_back(Arg); - Info->ABIArgOffset = Offset + MemVT.getStoreSize(); - continue; - } - assert(VA.isRegLoc() && "Parameter must be in a register!"); - - unsigned Reg = VA.getLocReg(); - - if (VT == MVT::i64) { - // For now assume it is a pointer - Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, - &AMDGPU::SReg_64RegClass); - Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); - InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); - continue; - } - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - - Reg = MF.addLiveIn(Reg, RC); - SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - - if (Arg.VT.isVector()) { - - // Build a vector from the registers - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - SmallVector Regs; - Regs.push_back(Val); - for (unsigned j = 1; j != NumElements; ++j) { - Reg = ArgLocs[ArgIdx++].getLocReg(); - Reg = MF.addLiveIn(Reg, RC); - Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); - } - - // Fill up the missing vector elements - NumElements = Arg.VT.getVectorNumElements() - NumElements; - Regs.append(NumElements, DAG.getUNDEF(VT)); - - InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); - continue; - } - - InVals.push_back(Val); - } - - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); - } - return Chain; -} - -MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { - - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - switch (MI->getOpcode()) { - default: - return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); - case AMDGPU::BRANCH: - return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } - } - return BB; -} - -bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { - // This currently forces unfolding various combinations of fsub into fma with - // free fneg'd operands. As long as we have fast FMA (controlled by - // isFMAFasterThanFMulAndFAdd), we should perform these. - - // When fma is quarter rate, for f64 where add / sub are at best half rate, - // most of these combines appear to be cycle neutral but save on instruction - // count / code size. - return true; -} - -EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const { - if (!VT.isVector()) { - return MVT::i1; - } - return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); -} - -MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { - return MVT::i32; -} - -// Answering this is somewhat tricky and depends on the specific device which -// have different rates for fma or all f64 operations. 
-// -// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other -// regardless of which device (although the number of cycles differs between -// devices), so it is always profitable for f64. -// -// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable -// only on full rate devices. Normally, we should prefer selecting v_mad_f32 -// which we can always do even without fused FP ops since it returns the same -// result as the separate operations and since it is always full -// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 -// however does not support denormals, so we do report fma as faster if we have -// a fast fma device and require denormals. -// -bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - VT = VT.getScalarType(); - - if (!VT.isSimple()) - return false; - - switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: - // This is as fast on some subtargets. However, we always have full rate f32 - // mad available which returns the same result as the separate operations - // which we should prefer over fma. We can't use this if we want to support - // denormals, so only report this in these cases. - return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); - case MVT::f64: - return true; - default: - break; - } - - return false; -} - -//===----------------------------------------------------------------------===// -// Custom DAG Lowering Operations -//===----------------------------------------------------------------------===// - -SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); - case ISD::BRCOND: return LowerBRCOND(Op, DAG); - case ISD::LOAD: { - SDValue Result = LowerLOAD(Op, DAG); - assert((!Result.getNode() || - Result.getNode()->getNumValues() == 2) && - "Load should return a value and a chain"); - return Result; - } - - case ISD::FSIN: - case ISD::FCOS: - return LowerTrig(Op, DAG); - case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::FDIV: return LowerFDIV(Op, DAG); - case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::GlobalAddress: { - MachineFunction &MF = DAG.getMachineFunction(); - SIMachineFunctionInfo *MFI = MF.getInfo(); - return LowerGlobalAddress(MFI, Op, DAG); - } - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); - } - return SDValue(); -} - -/// \brief Helper function for LowerBRCOND -static SDNode *findUser(SDValue Value, unsigned Opcode) { - - SDNode *Parent = Value.getNode(); - for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); - I != E; ++I) { - - if (I.getUse().get() != Value) - continue; - - if (I->getOpcode() == Opcode) - return *I; - } - return nullptr; -} - -SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { - - FrameIndexSDNode *FINode = cast(Op); - unsigned FrameIndex = FINode->getIndex(); - - return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); -} - -/// This transforms the control flow intrinsics to get the branch destination as -/// last parameter, also switches branch target with BR if the need arise -SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, - SelectionDAG &DAG) const { - - SDLoc DL(BRCOND); - - SDNode *Intr = BRCOND.getOperand(1).getNode(); - SDValue Target = BRCOND.getOperand(2); - SDNode *BR = nullptr; - - if 
(Intr->getOpcode() == ISD::SETCC) { - // As long as we negate the condition everything is fine - SDNode *SetCC = Intr; - assert(SetCC->getConstantOperandVal(1) == 1); - assert(cast(SetCC->getOperand(2).getNode())->get() == - ISD::SETNE); - Intr = SetCC->getOperand(0).getNode(); - - } else { - // Get the target from BR if we don't negate the condition - BR = findUser(BRCOND, ISD::BR); - Target = BR->getOperand(1); - } - - assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); - - // Build the result and - ArrayRef Res(Intr->value_begin() + 1, Intr->value_end()); - - // operands of the new intrinsic call - SmallVector Ops; - Ops.push_back(BRCOND.getOperand(0)); - Ops.append(Intr->op_begin() + 1, Intr->op_end()); - Ops.push_back(Target); - - // build the new intrinsic call - SDNode *Result = DAG.getNode( - Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, - DAG.getVTList(Res), Ops).getNode(); - - if (BR) { - // Give the branch instruction our target - SDValue Ops[] = { - BR->getOperand(0), - BRCOND.getOperand(2) - }; - SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); - DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); - BR = NewBR.getNode(); - } - - SDValue Chain = SDValue(Result, Result->getNumValues() - 1); - - // Copy the intrinsic results to registers - for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { - SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); - if (!CopyToReg) - continue; - - Chain = DAG.getCopyToReg( - Chain, DL, - CopyToReg->getOperand(1), - SDValue(Result, i - 1), - SDValue()); - - DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); - } - - // Remove the old intrinsic from the chain - DAG.ReplaceAllUsesOfValueWith( - SDValue(Intr, Intr->getNumValues() - 1), - Intr->getOperand(0)); - - return Chain; -} - -SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, - SDValue Op, - SelectionDAG &DAG) const { - GlobalAddressSDNode *GSD = cast(Op); - - if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) - return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); - - SDLoc DL(GSD); - const GlobalValue *GV = GSD->getGlobal(); - MVT PtrVT = getPointerTy(GSD->getAddressSpace()); - - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); - SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); -} - -SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, - SDValue V) const { - // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, - // so we will end up with redundant moves to m0. - // - // We can't use S_MOV_B32, because there is no way to specify m0 as the - // destination register. - // - // We have to use them both. Machine cse will combine all the S_MOV_B32 - // instructions and the register coalescer eliminate the extra copies. 
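(Editorial note, not part of the original file: a sketch of how callers consume the glue produced by copyToM0, mirroring the SI_fs_constant lowering later in this file. M0Value, AttrChan and Attr are hypothetical placeholders for the caller's operands.)

SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, M0Value);
SDValue Glue = M0.getValue(1); // the glue pins the m0 write before its reader
SDValue Interp = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
                             DAG.getConstant(2, DL, MVT::i32), // P0
                             AttrChan, Attr, Glue);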
- SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); - return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), - SDValue(M0, 0), SDValue()); // Glue - // A Null SDValue creates - // a glue result. -} - -SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - - EVT VT = Op.getValueType(); - SDLoc DL(Op); - unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); - - switch (IntrinsicID) { - case Intrinsic::r600_read_ngroups_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, false); - case Intrinsic::r600_read_ngroups_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, false); - case Intrinsic::r600_read_ngroups_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, false); - case Intrinsic::r600_read_global_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, false); - case Intrinsic::r600_read_global_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); - case Intrinsic::r600_read_global_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); - case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); - case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); - case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - - case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - MF.getInfo()->ABIArgOffset, - false); - - case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); - case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); - case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); - case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); - case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); - case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); - case AMDGPUIntrinsic::SI_load_const: { - SDValue Ops[] = { - Op.getOperand(1), - Op.getOperand(2) - }; - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, - Op->getVTList(), Ops, VT, MMO); - } - case AMDGPUIntrinsic::SI_sample: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); - case AMDGPUIntrinsic::SI_sampleb: - return 
LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); - case AMDGPUIntrinsic::SI_sampled: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); - case AMDGPUIntrinsic::SI_samplel: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); - case AMDGPUIntrinsic::SI_vs_load_input: - return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. - return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), - DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); - case AMDGPUIntrinsic::SI_fs_constant: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, - DAG.getConstant(2, DL, MVT::i32), // P0 - Op.getOperand(1), Op.getOperand(2), Glue); - } - case AMDGPUIntrinsic::SI_fs_interp: { - SDValue IJ = Op.getOperand(4); - SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(0, DL, MVT::i32)); - SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(1, DL, MVT::i32)); - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); - SDValue Glue = M0.getValue(1); - SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, - DAG.getVTList(MVT::f32, MVT::Glue), - I, Op.getOperand(1), Op.getOperand(2), Glue); - Glue = SDValue(P1.getNode(), 1); - return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, - Op.getOperand(1), Op.getOperand(2), Glue); - } - default: - return AMDGPUTargetLowering::LowerOperation(Op, DAG); - } -} - -SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - SDLoc DL(Op); - SDValue Chain = Op.getOperand(0); - unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); - - switch (IntrinsicID) { - case AMDGPUIntrinsic::SI_sendmsg: { - Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); - SDValue Glue = Chain.getValue(1); - return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, - Op.getOperand(2), Glue); - } - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDValue Ops[] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); - } - default: - return SDValue(); - } -} - -SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - LoadSDNode *Load = cast(Op); - - if (Op.getValueType().isVector()) { - assert(Op.getValueType().getVectorElementType() == MVT::i32 && - "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned NumElements = Op.getValueType().getVectorNumElements(); - assert(NumElements != 2 && "v2 loads are supported for all address spaces."); - switch (Load->getAddressSpace()) { - default: break; - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::PRIVATE_ADDRESS: - // v4 loads are supported for private and global memory. 
- if (NumElements <= 4) - break; - // fall-through - case AMDGPUAS::LOCAL_ADDRESS: - return ScalarizeVectorLoad(Op, DAG); - } - } - - return AMDGPUTargetLowering::LowerLOAD(Op, DAG); -} - -SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, - const SDValue &Op, - SelectionDAG &DAG) const { - return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4)); -} - -SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() != MVT::i64) - return SDValue(); - - SDLoc DL(Op); - SDValue Cond = Op.getOperand(0); - - SDValue Zero = DAG.getConstant(0, DL, MVT::i32); - SDValue One = DAG.getConstant(1, DL, MVT::i32); - - SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); - SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); - - SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); - SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); - - SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); - - SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); - SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); - - SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); - - SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); - return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); -} - -// Catch division cases where we can use shortcuts with rcp and rsq -// instructions. -SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - EVT VT = Op.getValueType(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; - - if (const ConstantFPSDNode *CLHS = dyn_cast(LHS)) { - if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && - CLHS->isExactlyValue(1.0)) { - // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to - // the CI documentation has a worst case error of 1 ulp. - // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to - // use it as long as we aren't trying to use denormals. - - // 1.0 / sqrt(x) -> rsq(x) - // - // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP - // error seems really high at 2^29 ULP. - if (RHS.getOpcode() == ISD::FSQRT) - return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); - - // 1.0 / x -> rcp(x) - return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - } - } - - if (Unsafe) { - // Turn into multiply by the reciprocal. - // x / y -> x * (1.0 / y) - SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); - } - - return SDValue(); -} - -SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - SDValue FastLowered = LowerFastFDIV(Op, DAG); - if (FastLowered.getNode()) - return FastLowered; - - // This uses v_rcp_f32 which does not handle denormals. Let this hit a - // selection error for now rather than do something incorrect. 
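(Editorial note, not part of the original file, on the two magic constants used a few lines below:)

// K0 = 0x6f800000 is 2^96 and K1 = 0x2f800000 is 2^-32 as IEEE-754 floats.
// For very large |y|, 1/y lands in the denormal range, where v_rcp_f32 would
// flush it to zero, so the denominator is pre-scaled:
//   s = (|y| > 2^96) ? 2^-32 : 1.0
//   x / y = x * rcp(y * s) * s,   since (1 / (y * 2^-32)) * 2^-32 = 1 / y
// which keeps the reciprocal comfortably inside the normal range.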
- if (Subtarget->hasFP32Denormals()) - return SDValue(); - - SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); - - const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); - - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); - - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); - - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); - - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); - - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); - - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); - - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); -} - -SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getTarget().Options.UnsafeFPMath) - return LowerFastFDIV(Op, DAG); - - SDLoc SL(Op); - SDValue X = Op.getOperand(0); - SDValue Y = Op.getOperand(1); - - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); - - SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); - - SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); - - SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); - - SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); - - SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); - - SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); - - SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); - - SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); - - SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); - - SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, - NegDivScale0, Mul, DivScale1); - - SDValue Scale; - - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { - // Workaround a hardware bug on SI where the condition output from div_scale - // is not usable. - - const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); - - // Figure out if the scale to use for div_fmas. 
- SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); - SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); - SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); - SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); - - SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); - SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); - - SDValue Scale0Hi - = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); - SDValue Scale1Hi - = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); - - SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); - SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); - Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); - } else { - Scale = DivScale1.getValue(1); - } - - SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, - Fma4, Fma3, Mul, Scale); - - return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); -} - -SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - - if (VT == MVT::f32) - return LowerFDIV32(Op, DAG); - - if (VT == MVT::f64) - return LowerFDIV64(Op, DAG); - - llvm_unreachable("Unexpected type for fdiv"); -} - -SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - StoreSDNode *Store = cast(Op); - EVT VT = Store->getMemoryVT(); - - // These stores are legal. - if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { - if (VT.isVector() && VT.getVectorNumElements() > 4) - return ScalarizeVectorStore(Op, DAG); - return SDValue(); - } - - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) - return Ret; - - if (VT.isVector() && VT.getVectorNumElements() >= 8) - return ScalarizeVectorStore(Op, DAG); - - if (VT == MVT::i1) - return DAG.getTruncStore(Store->getChain(), DL, - DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), - Store->getBasePtr(), MVT::i1, Store->getMemOperand()); - - return SDValue(); -} - -SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue Arg = Op.getOperand(0); - SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Arg, - DAG.getConstantFP(0.5/M_PI, DL, - VT))); - - switch (Op.getOpcode()) { - case ISD::FCOS: - return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); - case ISD::FSIN: - return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); - default: - llvm_unreachable("Wrong trig opcode"); - } -} - -//===----------------------------------------------------------------------===// -// Custom DAG optimizations -//===----------------------------------------------------------------------===// - -SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - EVT VT = N->getValueType(0); - EVT ScalarVT = VT.getScalarType(); - if (ScalarVT != MVT::f32) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - SDValue Src = N->getOperand(0); - EVT SrcVT = Src.getValueType(); - - // TODO: We could try to match extracting the higher bytes, which would be - // easier if i8 vectors weren't promoted to i32 vectors, particularly after - // types are legalized. v4i8 -> v4f32 is probably the only case to worry - // about in practice. 
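(Editorial worked example, not part of the original file, of the scalar case handled just below; %v is a hypothetical i32 value:)

//   %x = and i32 %v, 255
//   %f = uitofp i32 %x to float
// MaskedValueIsZero proves the top 24 bits of %x are zero, so the pair
// becomes (CVT_F32_UBYTE0 %v), which selects to v_cvt_f32_ubyte0.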
- if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { - if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { - SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); - DCI.AddToWorklist(Cvt.getNode()); - return Cvt; - } - } - - // We are primarily trying to catch operations on illegal vector types - // before they are expanded. - // For scalars, we can use the more flexible method of checking masked bits - // after legalization. - if (!DCI.isBeforeLegalize() || - !SrcVT.isVector() || - SrcVT.getVectorElementType() != MVT::i8) { - return SDValue(); - } - - assert(DCI.isBeforeLegalize() && "Unexpected legal type"); - - // Weird sized vectors are a pain to handle, but we know 3 is really the same - // size as 4. - unsigned NElts = SrcVT.getVectorNumElements(); - if (!SrcVT.isSimple() && NElts != 3) - return SDValue(); - - // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to - // prevent a mess from expanding to v4i32 and repacking. - if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); - EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); - EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); - LoadSDNode *Load = cast(Src); - - unsigned AS = Load->getAddressSpace(); - unsigned Align = Load->getAlignment(); - Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); - - // Don't try to replace the load if we have to expand it due to alignment - // problems. Otherwise we will end up scalarizing the load, and trying to - // repack into the vector for no real reason. - if (Align < ABIAlignment && - !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { - return SDValue(); - } - - SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, - Load->getChain(), - Load->getBasePtr(), - LoadVT, - Load->getMemOperand()); - - // Make sure successors of the original load stay after it by updating - // them to use the new Chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); - - SmallVector Elts; - if (RegVT.isVector()) - DAG.ExtractVectorElements(NewLoad, Elts); - else - Elts.push_back(NewLoad); - - SmallVector Ops; - - unsigned EltIdx = 0; - for (SDValue Elt : Elts) { - unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); - for (unsigned I = 0; I < ComponentsInElt; ++I) { - unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; - SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); - DCI.AddToWorklist(Cvt.getNode()); - Ops.push_back(Cvt); - } - - ++EltIdx; - } - - assert(Ops.size() == NElts); - - return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); - } - - return SDValue(); -} - -/// \brief Return true if the given offset Size in bytes can be folded into -/// the immediate offsets of a memory instruction for the given address space. -static bool canFoldOffset(unsigned OffsetSize, unsigned AS, - const AMDGPUSubtarget &STI) { - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: { - // MUBUF instructions a 12-bit offset in bytes. - return isUInt<12>(OffsetSize); - } - case AMDGPUAS::CONSTANT_ADDRESS: { - // SMRD instructions have an 8-bit offset in dwords on SI and - // a 20-bit offset in bytes on VI. 
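(Editorial worked example, not part of the original file, for the SMRD case below; the offsets are hypothetical:)

//   OffsetSize = 1024: SI/CI: 1024/4 = 256 needs 9 bits   -> not foldable
//                      VI:    1024 fits in 20 bits        -> foldable
//   OffsetSize = 1020: SI/CI: 1020/4 = 255 fits in 8 bits -> foldable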
- if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return isUInt<20>(OffsetSize); - else - return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); - } - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { - // The single offset versions have a 16-bit offset in bytes. - return isUInt<16>(OffsetSize); - } - case AMDGPUAS::PRIVATE_ADDRESS: - // Indirect register addressing does not use any offsets. - default: - return 0; - } -} - -// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) - -// This is a variant of -// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), -// -// The normal DAG combiner will do this, but only if the add has one use since -// that would increase the number of instructions. -// -// This prevents us from seeing a constant offset that can be folded into a -// memory instruction's addressing mode. If we know the resulting add offset of -// a pointer can be folded into an addressing offset, we can replace the pointer -// operand with the add of new constant offset. This eliminates one of the uses, -// and may allow the remaining use to also be simplified. -// -SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, - unsigned AddrSpace, - DAGCombinerInfo &DCI) const { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - if (N0.getOpcode() != ISD::ADD) - return SDValue(); - - const ConstantSDNode *CN1 = dyn_cast(N1); - if (!CN1) - return SDValue(); - - const ConstantSDNode *CAdd = dyn_cast(N0.getOperand(1)); - if (!CAdd) - return SDValue(); - - // If the resulting offset is too large, we can't fold it into the addressing - // mode offset. - APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); - if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc SL(N); - EVT VT = N->getValueType(0); - - SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); - SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); - - return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); -} - -SDValue SITargetLowering::performAndCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - if (DCI.isBeforeLegalize()) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - - // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> - // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - if (LHS.getOpcode() == ISD::SETCC && - RHS.getOpcode() == ISD::SETCC) { - ISD::CondCode LCC = cast(LHS.getOperand(2))->get(); - ISD::CondCode RCC = cast(RHS.getOperand(2))->get(); - - SDValue X = LHS.getOperand(0); - SDValue Y = RHS.getOperand(0); - if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) - return SDValue(); - - if (LCC == ISD::SETO) { - if (X != LHS.getOperand(1)) - return SDValue(); - - if (RCC == ISD::SETUNE) { - const ConstantFPSDNode *C1 = dyn_cast(RHS.getOperand(1)); - if (!C1 || !C1->isInfinity() || C1->isNegative()) - return SDValue(); - - const uint32_t Mask = SIInstrFlags::N_NORMAL | - SIInstrFlags::N_SUBNORMAL | - SIInstrFlags::N_ZERO | - SIInstrFlags::P_ZERO | - SIInstrFlags::P_SUBNORMAL | - SIInstrFlags::P_NORMAL; - - static_assert(((~(SIInstrFlags::S_NAN | - SIInstrFlags::Q_NAN | - SIInstrFlags::N_INFINITY | - SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, - "mask not equal"); - - SDLoc DL(N); - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, - X, DAG.getConstant(Mask, DL, MVT::i32)); - } - } - } - - return SDValue(); -} - -SDValue SITargetLowering::performOrCombine(SDNode *N, - 
DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) - if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && - RHS.getOpcode() == AMDGPUISD::FP_CLASS) { - SDValue Src = LHS.getOperand(0); - if (Src != RHS.getOperand(0)) - return SDValue(); - - const ConstantSDNode *CLHS = dyn_cast(LHS.getOperand(1)); - const ConstantSDNode *CRHS = dyn_cast(RHS.getOperand(1)); - if (!CLHS || !CRHS) - return SDValue(); - - // Only 10 bits are used. - static const uint32_t MaxMask = 0x3ff; - - uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; - SDLoc DL(N); - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, - Src, DAG.getConstant(NewMask, DL, MVT::i32)); - } - - return SDValue(); -} - -SDValue SITargetLowering::performClassCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDValue Mask = N->getOperand(1); - - // fp_class x, 0 -> false - if (const ConstantSDNode *CMask = dyn_cast(Mask)) { - if (CMask->isNullValue()) - return DAG.getConstant(0, SDLoc(N), MVT::i1); - } - - return SDValue(); -} - -static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { - switch (Opc) { - case ISD::FMAXNUM: - return AMDGPUISD::FMAX3; - case ISD::SMAX: - return AMDGPUISD::SMAX3; - case ISD::UMAX: - return AMDGPUISD::UMAX3; - case ISD::FMINNUM: - return AMDGPUISD::FMIN3; - case ISD::SMIN: - return AMDGPUISD::SMIN3; - case ISD::UMIN: - return AMDGPUISD::UMIN3; - default: - llvm_unreachable("Not a min/max opcode"); - } -} - -SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - - unsigned Opc = N->getOpcode(); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - - // Only do this if the inner op has one use since this will just increase - // register pressure for no benefit.
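(Editorial sketch, not part of the original file: the two rewrites below as DAG shapes, with a, b, c arbitrary values:)

//   (smax (smax a, b), c)  -->  (SMAX3 a, b, c)
//   (smax a, (smax b, c))  -->  (SMAX3 a, b, c)
// i.e. one v_max3_i32 instead of two v_max_i32, and legal only while the
// inner node has a single use, per the comment above.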
- - // max(max(a, b), c) - if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0.getOperand(0), - Op0.getOperand(1), - Op1); - } - - // max(a, max(b, c)) - if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0, - Op1.getOperand(0), - Op1.getOperand(1)); - } - - return SDValue(); -} - -SDValue SITargetLowering::performSetCCCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDLoc SL(N); - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - EVT VT = LHS.getValueType(); - - if (VT != MVT::f32 && VT != MVT::f64) - return SDValue(); - - // Match isinf pattern - // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) - ISD::CondCode CC = cast(N->getOperand(2))->get(); - if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { - const ConstantFPSDNode *CRHS = dyn_cast(RHS); - if (!CRHS) - return SDValue(); - - const APFloat &APF = CRHS->getValueAPF(); - if (APF.isInfinity() && !APF.isNegative()) { - unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; - return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), - DAG.getConstant(Mask, SL, MVT::i32)); - } - } - - return SDValue(); -} - -SDValue SITargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - switch (N->getOpcode()) { - default: - return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - case ISD::SETCC: - return performSetCCCombine(N, DCI); - case ISD::FMAXNUM: // TODO: What about fmax_legacy? - case ISD::FMINNUM: - case ISD::SMAX: - case ISD::SMIN: - case ISD::UMAX: - case ISD::UMIN: { - if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && - N->getValueType(0) != MVT::f64 && - getTargetMachine().getOptLevel() > CodeGenOpt::None) - return performMin3Max3Combine(N, DCI); - break; - } - - case AMDGPUISD::CVT_F32_UBYTE0: - case AMDGPUISD::CVT_F32_UBYTE1: - case AMDGPUISD::CVT_F32_UBYTE2: - case AMDGPUISD::CVT_F32_UBYTE3: { - unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; - - SDValue Src = N->getOperand(0); - APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); - - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Src, Demanded) || - TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { - DCI.CommitTargetLoweringOpt(TLO); - } - - break; - } - - case ISD::UINT_TO_FP: { - return performUCharToFloatCombine(N, DCI); - - case ISD::FADD: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) - break; - - EVT VT = N->getValueType(0); - if (VT != MVT::f32) - break; - - // Only do this if we are not trying to support denormals. v_mad_f32 does - // not support denormals ever. - if (Subtarget->hasFP32Denormals()) - break; - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - // These should really be instruction patterns, but writing patterns with - // source modiifiers is a pain. 
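(Editorial note, not part of the original file: the algebra behind the two folds below:)

//   fadd (fadd a, a), b  ==  2.0 * a + b  -->  (fmad 2.0, a, b)
// 2.0 is an inline constant on SI, so this selects to a single v_mad_f32
// with no extra literal.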
- - // fadd (fadd (a, a), b) -> mad 2.0, a, b - if (LHS.getOpcode() == ISD::FADD) { - SDValue A = LHS.getOperand(0); - if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); - } - } - - // fadd (b, fadd (a, a)) -> mad 2.0, a, b - if (RHS.getOpcode() == ISD::FADD) { - SDValue A = RHS.getOperand(0); - if (A == RHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); - } - } - - return SDValue(); - } - case ISD::FSUB: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) - break; - - EVT VT = N->getValueType(0); - - // Try to get the fneg to fold into the source modifier. This undoes generic - // DAG combines and folds them into the mad. - // - // Only do this if we are not trying to support denormals. v_mad_f32 does - // not support denormals ever. - if (VT == MVT::f32 && - !Subtarget->hasFP32Denormals()) { - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if (LHS.getOpcode() == ISD::FADD) { - // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) - - SDValue A = LHS.getOperand(0); - if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); - - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); - } - } - - if (RHS.getOpcode() == ISD::FADD) { - // (fsub c, (fadd a, a)) -> mad -2.0, a, c - - SDValue A = RHS.getOperand(0); - if (A == RHS.getOperand(1)) { - const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); - } - } - - return SDValue(); - } - - break; - } - } - case ISD::LOAD: - case ISD::STORE: - case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: - case ISD::ATOMIC_CMP_SWAP: - case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: - case ISD::ATOMIC_SWAP: - case ISD::ATOMIC_LOAD_ADD: - case ISD::ATOMIC_LOAD_SUB: - case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_NAND: - case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_MAX: - case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. - if (DCI.isBeforeLegalize()) - break; - - MemSDNode *MemNode = cast(N); - SDValue Ptr = MemNode->getBasePtr(); - - // TODO: We could also do this for multiplies. - unsigned AS = MemNode->getAddressSpace(); - if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { - SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); - if (NewPtr) { - SmallVector NewOps(MemNode->op_begin(), MemNode->op_end()); - - NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; - return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); - } - } - break; - } - case ISD::AND: - return performAndCombine(N, DCI); - case ISD::OR: - return performOrCombine(N, DCI); - case AMDGPUISD::FP_CLASS: - return performClassCombine(N, DCI); - } - return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); -} - -/// \brief Analyze the possible immediate value Op -/// -/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate -/// and the immediate value if it's a literal immediate -int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { - - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - if (const ConstantSDNode *Node = dyn_cast(N)) { - if (TII->isInlineConstant(Node->getAPIntValue())) - return 0; - - uint64_t Val = Node->getZExtValue(); - return isUInt<32>(Val) ? 
Val : -1; - } - - if (const ConstantFPSDNode *Node = dyn_cast(N)) { - if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) - return 0; - - if (Node->getValueType(0) == MVT::f32) - return FloatToBits(Node->getValueAPF().convertToFloat()); - - return -1; - } - - return -1; -} - -/// \brief Helper function for adjustWritemask -static unsigned SubIdx2Lane(unsigned Idx) { - switch (Idx) { - default: return 0; - case AMDGPU::sub0: return 0; - case AMDGPU::sub1: return 1; - case AMDGPU::sub2: return 2; - case AMDGPU::sub3: return 3; - } -} - -/// \brief Adjust the writemask of MIMG instructions -void SITargetLowering::adjustWritemask(MachineSDNode *&Node, - SelectionDAG &DAG) const { - SDNode *Users[4] = { }; - unsigned Lane = 0; - unsigned OldDmask = Node->getConstantOperandVal(0); - unsigned NewDmask = 0; - - // Try to figure out the used register components - for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); - I != E; ++I) { - - // Abort if we can't understand the usage - if (!I->isMachineOpcode() || - I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) - return; - - // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. - // Note that subregs are packed, i.e. Lane==0 is the first bit set - // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit - // set, etc. - Lane = SubIdx2Lane(I->getConstantOperandVal(1)); - - // Set which texture component corresponds to the lane. - unsigned Comp; - for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { - assert(Dmask); - Comp = countTrailingZeros(Dmask); - Dmask &= ~(1 << Comp); - } - - // Abort if we have more than one user per component - if (Users[Lane]) - return; - - Users[Lane] = *I; - NewDmask |= 1 << Comp; - } - - // Abort if there's no change - if (NewDmask == OldDmask) - return; - - // Adjust the writemask in the node - std::vector Ops; - Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); - Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); - Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); - - // If we only got one lane, replace it with a copy - // (if NewDmask has only one bit set...) - if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { - SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), - MVT::i32); - SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, - SDLoc(), Users[Lane]->getValueType(0), - SDValue(Node, 0), RC); - DAG.ReplaceAllUsesWith(Users[Lane], Copy); - return; - } - - // Update the users of the node with the new indices - for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { - - SDNode *User = Users[i]; - if (!User) - continue; - - SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); - DAG.UpdateNodeOperands(User, User->getOperand(0), Op); - - switch (Idx) { - default: break; - case AMDGPU::sub0: Idx = AMDGPU::sub1; break; - case AMDGPU::sub1: Idx = AMDGPU::sub2; break; - case AMDGPU::sub2: Idx = AMDGPU::sub3; break; - } - } -} - -/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) -/// with frame index operands. -/// LLVM assumes that inputs are to these instructions are registers. 
-void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, - SelectionDAG &DAG) const { - - SmallVector Ops; - for (unsigned i = 0; i < Node->getNumOperands(); ++i) { - if (!isa(Node->getOperand(i))) { - Ops.push_back(Node->getOperand(i)); - continue; - } - - SDLoc DL(Node); - Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, - Node->getOperand(i).getValueType(), - Node->getOperand(i)), 0)); - } - - DAG.UpdateNodeOperands(Node, Ops); -} - -/// \brief Fold the instructions after selecting them. -SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, - SelectionDAG &DAG) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - if (TII->isMIMG(Node->getMachineOpcode())) - adjustWritemask(Node, DAG); - - if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || - Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { - legalizeTargetIndependentNode(Node, DAG); - return Node; - } - return Node; -} - -/// \brief Assign the register class depending on the number of -/// bits set in the writemask -void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, - SDNode *Node) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - TII->legalizeOperands(MI); - - if (TII->isMIMG(MI->getOpcode())) { - unsigned VReg = MI->getOperand(0).getReg(); - unsigned Writemask = MI->getOperand(1).getImm(); - unsigned BitsSet = 0; - for (unsigned i = 0; i < 4; ++i) - BitsSet += Writemask & (1 << i) ? 1 : 0; - - const TargetRegisterClass *RC; - switch (BitsSet) { - default: return; - case 1: RC = &AMDGPU::VGPR_32RegClass; break; - case 2: RC = &AMDGPU::VReg_64RegClass; break; - case 3: RC = &AMDGPU::VReg_96RegClass; break; - } - - unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); - MI->setDesc(TII->get(NewOpcode)); - MRI.setRegClass(VReg, RC); - return; - } - - // Replace unused atomics with the no return version. - int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); - if (NoRetAtomicOp != -1) { - if (!Node->hasAnyUseOfValue(0)) { - MI->setDesc(TII->get(NoRetAtomicOp)); - MI->RemoveOperand(0); - } - - return; - } -} - -static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { - SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); - return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); -} - -MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); -#if 1 - // XXX - Workaround for moveToVALU not handling different register class - // inserts for REG_SEQUENCE. - - // Build the half of the subregister with the constants. - const SDValue Ops0[] = { - DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, Ops0), 0); - - // Combine the constants and the pointer. 
-  const SDValue Ops1[] = {
-    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
-    Ptr,
-    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
-    SubRegHi,
-    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
-  };
-
-  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
-#else
-  const SDValue Ops[] = {
-    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
-    Ptr,
-    DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
-    buildSMovImm32(DAG, DL, 0),
-    DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
-    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
-    DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
-  };
-
-  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
-
-#endif
-}
-
-/// \brief Return a resource descriptor with the 'Add TID' bit enabled.
-/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
-/// of the resource descriptor) to create an offset, which is added to the
-/// resource pointer.
-MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
-                                           SDLoc DL,
-                                           SDValue Ptr,
-                                           uint32_t RsrcDword1,
-                                           uint64_t RsrcDword2And3) const {
-  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
-  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
-  if (RsrcDword1) {
-    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
-                                       DAG.getConstant(RsrcDword1, DL, MVT::i32)),
-                    0);
-  }
-
-  SDValue DataLo = buildSMovImm32(DAG, DL,
-                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
-  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
-
-  const SDValue Ops[] = {
-    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
-    PtrLo,
-    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
-    PtrHi,
-    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
-    DataLo,
-    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
-    DataHi,
-    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
-  };
-
-  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
-}
-
-MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
-                                                  SDLoc DL,
-                                                  SDValue Ptr) const {
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-  uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
-                  0xffffffff; // Size
-
-  return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
-}
-
-SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
-                                               const TargetRegisterClass *RC,
-                                               unsigned Reg, EVT VT) const {
-  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
-
-  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
-                            cast<RegisterSDNode>(VReg)->getReg(), VT);
-}
-
-//===----------------------------------------------------------------------===//
-// SI Inline Assembly Support
-//===----------------------------------------------------------------------===//
-
-std::pair<unsigned, const TargetRegisterClass *>
-SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                               const std::string &Constraint,
-                                               MVT VT) const {
-  if (Constraint == "r") {
-    switch(VT.SimpleTy) {
-    default: llvm_unreachable("Unhandled type for 'r' inline asm constraint");
-    case MVT::i64:
-      return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
-    case MVT::i32:
-      return std::make_pair(0U, &AMDGPU::SGPR_32RegClass);
-    }
-  }
-
-  if (Constraint.size() > 1) {
-    const TargetRegisterClass *RC = nullptr;
-    if (Constraint[1] == 'v') {
-      RC = &AMDGPU::VGPR_32RegClass;
-    } else if (Constraint[1] == 's') {
-      RC = &AMDGPU::SGPR_32RegClass;
-    }
-
-    if (RC) {
-      unsigned Idx =
std::atoi(Constraint.substr(2).c_str()); - if (Idx < RC->getNumRegs()) - return std::make_pair(RC->getRegister(Idx), RC); - } - } - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIDefines.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIDefines.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIDefines.h (nonexistent) @@ -1,172 +0,0 @@ -//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCInstrDesc.h" - -#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H -#define LLVM_LIB_TARGET_R600_SIDEFINES_H - -namespace SIInstrFlags { -// This needs to be kept in sync with the field bits in InstSI. -enum { - SALU = 1 << 3, - VALU = 1 << 4, - - SOP1 = 1 << 5, - SOP2 = 1 << 6, - SOPC = 1 << 7, - SOPK = 1 << 8, - SOPP = 1 << 9, - - VOP1 = 1 << 10, - VOP2 = 1 << 11, - VOP3 = 1 << 12, - VOPC = 1 << 13, - - MUBUF = 1 << 14, - MTBUF = 1 << 15, - SMRD = 1 << 16, - DS = 1 << 17, - MIMG = 1 << 18, - FLAT = 1 << 19, - WQM = 1 << 20, - VGPRSpill = 1 << 21 -}; -} - -namespace llvm { -namespace AMDGPU { - enum OperandType { - /// Operand with register or 32-bit immediate - OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET, - /// Operand with register or inline constant - OPERAND_REG_INLINE_C - }; -} -} - -namespace SIInstrFlags { - enum Flags { - // First 4 bits are the instruction encoding - VM_CNT = 1 << 0, - EXP_CNT = 1 << 1, - LGKM_CNT = 1 << 2 - }; - - // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. - // The result is true if any of these tests are true. 
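As the comment above says, v_cmp_class is a pure any-of test; a scalar model, assuming the ClassFlags enumerators defined just below:

#include <cstdint>

// True when the bit describing the input's floating-point class is set
// in the 10-bit operand mask.
static bool vCmpClass(uint32_t ValueClassBit, uint32_t Mask) {
  return (Mask & ValueClassBit) != 0;
}
// e.g. a mask of S_NAN | Q_NAN accepts any NaN input and rejects the rest.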
- enum ClassFlags { - S_NAN = 1 << 0, // Signaling NaN - Q_NAN = 1 << 1, // Quiet NaN - N_INFINITY = 1 << 2, // Negative infinity - N_NORMAL = 1 << 3, // Negative normal - N_SUBNORMAL = 1 << 4, // Negative subnormal - N_ZERO = 1 << 5, // Negative zero - P_ZERO = 1 << 6, // Positive zero - P_SUBNORMAL = 1 << 7, // Positive subnormal - P_NORMAL = 1 << 8, // Positive normal - P_INFINITY = 1 << 9 // Positive infinity - }; -} - -namespace SISrcMods { - enum { - NEG = 1 << 0, - ABS = 1 << 1 - }; -} - -namespace SIOutMods { - enum { - NONE = 0, - MUL2 = 1, - MUL4 = 2, - DIV2 = 3 - }; -} - -#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 -#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C -#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) -#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 -#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 -#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 -#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) -#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) -#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C -#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0) -#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) -#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) -#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8) -#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9) -#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10) -#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11) - -#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) -#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC - - -#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 -#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) -#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) -#define C_00B848_VGPRS 0xFFFFFFC0 -#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) -#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) -#define C_00B848_SGPRS 0xFFFFFC3F -#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) -#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) -#define C_00B848_PRIORITY 0xFFFFF3FF -#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) -#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) -#define C_00B848_FLOAT_MODE 0xFFF00FFF -#define S_00B848_PRIV(x) (((x) & 0x1) << 20) -#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) -#define C_00B848_PRIV 0xFFEFFFFF -#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) -#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) -#define C_00B848_DX10_CLAMP 0xFFDFFFFF -#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) -#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) -#define C_00B848_DEBUG_MODE 0xFFBFFFFF -#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) -#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) -#define C_00B848_IEEE_MODE 0xFF7FFFFF - - -// Helpers for setting FLOAT_MODE -#define FP_ROUND_ROUND_TO_NEAREST 0 -#define FP_ROUND_ROUND_TO_INF 1 -#define FP_ROUND_ROUND_TO_NEGINF 2 -#define FP_ROUND_ROUND_TO_ZERO 3 - -// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double -// precision. -#define FP_ROUND_MODE_SP(x) ((x) & 0x3) -#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2) - -#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0 -#define FP_DENORM_FLUSH_OUT 1 -#define FP_DENORM_FLUSH_IN 2 -#define FP_DENORM_FLUSH_NONE 3 - - -// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double -// precision. 
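Putting the FLOAT_MODE helpers above together (the shift macros for the denormal fields follow just below): bits 1:0 and 3:2 hold the single and double precision rounding modes, and bits 5:4 and 7:6 hold the corresponding denormal modes. A hypothetical mode word, assuming the constants above:

// Round-to-nearest everywhere; flush SP denormals in and out, keep DP
// denormals (a sketch, not a recommended configuration).
unsigned FloatMode = FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
                     FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
                     (FP_DENORM_FLUSH_IN_FLUSH_OUT << 4) |  // SP, bits 5:4
                     (FP_DENORM_FLUSH_NONE << 6);           // DP, bits 7:6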
-#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) -#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) - -#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 -#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) - -#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 -#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) - - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp (nonexistent) @@ -1,77 +0,0 @@ -//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - - -#include "SIMachineFunctionInfo.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -#define MAX_LANES 64 - -using namespace llvm; - - -// Pin the vtable to this file. -void SIMachineFunctionInfo::anchor() {} - -SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) - : AMDGPUMachineFunction(MF), - TIDReg(AMDGPU::NoRegister), - HasSpilledVGPRs(false), - PSInputAddr(0), - NumUserSGPRs(0), - LDSWaveSpillSize(0) { } - -SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( - MachineFunction *MF, - unsigned FrameIndex, - unsigned SubIdx) { - const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIRegisterInfo *TRI = static_cast( - MF->getSubtarget().getRegisterInfo()); - MachineRegisterInfo &MRI = MF->getRegInfo(); - int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); - Offset += SubIdx * 4; - - unsigned LaneVGPRIdx = Offset / (64 * 4); - unsigned Lane = (Offset / 4) % 64; - - struct SpilledReg Spill; - - if (!LaneVGPRs.count(LaneVGPRIdx)) { - unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); - LaneVGPRs[LaneVGPRIdx] = LaneVGPR; - MRI.setPhysRegUsed(LaneVGPR); - - // Add this register as live-in to all blocks to avoid machine verifer - // complaining about use of an undefined physical register. - for (MachineFunction::iterator BI = MF->begin(), BE = MF->end(); - BI != BE; ++BI) { - BI->addLiveIn(LaneVGPR); - } - } - - Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; - Spill.Lane = Lane; - return Spill; -} - -unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( - const MachineFunction &MF) const { - const AMDGPUSubtarget &ST = MF.getSubtarget(); - // FIXME: We should get this information from kernel attributes if it - // is available. - return getShaderType() == ShaderType::COMPUTE ? 
256 : ST.getWavefrontSize();
-}
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp (nonexistent)
@@ -1,292 +0,0 @@
-//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief The AMDGPU target machine contains all of the hardware specific
-/// information needed to emit code for R600 and SI GPUs.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPUTargetMachine.h"
-#include "AMDGPU.h"
-#include "AMDGPUTargetTransformInfo.h"
-#include "R600ISelLowering.h"
-#include "R600InstrInfo.h"
-#include "R600MachineScheduler.h"
-#include "SIISelLowering.h"
-#include "SIInstrInfo.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_os_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Scalar.h"
-#include 
-
-using namespace llvm;
-
-extern "C" void LLVMInitializeR600Target() {
-  // Register the target
-  RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
-  RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);
-}
-
-static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
-  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
-}
-
-static MachineSchedRegistry
-SchedCustomRegistry("r600", "Run R600's custom scheduler",
-                    createR600MachineScheduler);
-
-static std::string computeDataLayout(StringRef TT) {
-  Triple Triple(TT);
-  std::string Ret = "e-p:32:32";
-
-  if (Triple.getArch() == Triple::amdgcn) {
-    // 32-bit private, local, and region pointers. 64-bit global and constant.
- Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; - } - - Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" - "-v512:512-v1024:1024-v2048:2048-n32:64"; - - return Ret; -} - -AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OptLevel) - : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, - OptLevel), - TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), - IntrinsicInfo() { - setRequiresStructuredCFG(true); - initAsmInfo(); -} - -AMDGPUTargetMachine::~AMDGPUTargetMachine() { - delete TLOF; -} - -//===----------------------------------------------------------------------===// -// R600 Target Machine (R600 -> Cayman) -//===----------------------------------------------------------------------===// - -R600TargetMachine::R600TargetMachine(const Target &T, StringRef TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL) : - AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { } - - -//===----------------------------------------------------------------------===// -// GCN Target Machine (SI+) -//===----------------------------------------------------------------------===// - -GCNTargetMachine::GCNTargetMachine(const Target &T, StringRef TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL) : - AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) { } - -//===----------------------------------------------------------------------===// -// AMDGPU Pass Setup -//===----------------------------------------------------------------------===// - -namespace { -class AMDGPUPassConfig : public TargetPassConfig { -public: - AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} - - AMDGPUTargetMachine &getAMDGPUTargetMachine() const { - return getTM(); - } - - ScheduleDAGInstrs * - createMachineScheduler(MachineSchedContext *C) const override { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - return createR600MachineScheduler(C); - return nullptr; - } - - void addIRPasses() override; - void addCodeGenPrepare() override; - virtual bool addPreISel() override; - virtual bool addInstSelector() override; -}; - -class R600PassConfig : public AMDGPUPassConfig { -public: - R600PassConfig(TargetMachine *TM, PassManagerBase &PM) - : AMDGPUPassConfig(TM, PM) { } - - bool addPreISel() override; - void addPreRegAlloc() override; - void addPreSched2() override; - void addPreEmitPass() override; -}; - -class GCNPassConfig : public AMDGPUPassConfig { -public: - GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) - : AMDGPUPassConfig(TM, PM) { } - bool addPreISel() override; - bool addInstSelector() override; - void addPreRegAlloc() override; - void addPostRegAlloc() override; - void addPreSched2() override; - void addPreEmitPass() override; -}; - -} // End of anonymous namespace - -TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); }); -} - -void AMDGPUPassConfig::addIRPasses() { - // Function calls are not supported, so make sure we inline everything. 
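  // (Two passes cooperate here: the AMDGPU always-inline marker tags each
  // function, and the generic always-inliner then folds callees into their
  // callers; an illustrative summary, see the pass implementations.)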
-  addPass(createAMDGPUAlwaysInlinePass());
-  addPass(createAlwaysInlinerPass());
-  // We need to add the barrier noop pass, otherwise adding the function
-  // inlining pass will cause all of the PassConfig's passes to be run
-  // one function at a time, which means if we have a module with two
-  // functions, then we will generate code for the first function
-  // without ever running any passes on the second.
-  addPass(createBarrierNoopPass());
-  TargetPassConfig::addIRPasses();
-}
-
-void AMDGPUPassConfig::addCodeGenPrepare() {
-  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
-  if (ST.isPromoteAllocaEnabled()) {
-    addPass(createAMDGPUPromoteAlloca(ST));
-    addPass(createSROAPass());
-  }
-  TargetPassConfig::addCodeGenPrepare();
-}
-
-bool
-AMDGPUPassConfig::addPreISel() {
-  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
-  addPass(createFlattenCFGPass());
-  if (ST.IsIRStructurizerEnabled())
-    addPass(createStructurizeCFGPass());
-  return false;
-}
-
-bool AMDGPUPassConfig::addInstSelector() {
-  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
-  return false;
-}
-
-//===----------------------------------------------------------------------===//
-// R600 Pass Setup
-//===----------------------------------------------------------------------===//
-
-bool R600PassConfig::addPreISel() {
-  AMDGPUPassConfig::addPreISel();
-  addPass(createR600TextureIntrinsicsReplacer());
-  return false;
-}
-
-void R600PassConfig::addPreRegAlloc() {
-  addPass(createR600VectorRegMerger(*TM));
-}
-
-void R600PassConfig::addPreSched2() {
-  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
-  addPass(createR600EmitClauseMarkers(), false);
-  if (ST.isIfCvtEnabled())
-    addPass(&IfConverterID, false);
-  addPass(createR600ClauseMergePass(*TM), false);
-}
-
-void R600PassConfig::addPreEmitPass() {
-  addPass(createAMDGPUCFGStructurizerPass(), false);
-  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
-  addPass(&FinalizeMachineBundlesID, false);
-  addPass(createR600Packetizer(*TM), false);
-  addPass(createR600ControlFlowFinalizer(*TM), false);
-}
-
-TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new R600PassConfig(this, PM);
-}
-
-//===----------------------------------------------------------------------===//
-// GCN Pass Setup
-//===----------------------------------------------------------------------===//
-
-bool GCNPassConfig::addPreISel() {
-  AMDGPUPassConfig::addPreISel();
-  addPass(createSinkingPass());
-  addPass(createSITypeRewriter());
-  addPass(createSIAnnotateControlFlowPass());
-  return false;
-}
-
-bool GCNPassConfig::addInstSelector() {
-  AMDGPUPassConfig::addInstSelector();
-  addPass(createSILowerI1CopiesPass());
-  addPass(createSIFixSGPRCopiesPass(*TM));
-  addPass(createSIFoldOperandsPass());
-  return false;
-}
-
-void GCNPassConfig::addPreRegAlloc() {
-  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
-
-  // This needs to be run directly before register allocation because
-  // earlier passes might recompute live intervals.
-  // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
-  if (getOptLevel() > CodeGenOpt::None) {
-    initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry());
-    insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
-  }
-
-  if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
-    // Don't do this with no optimizations since it throws away debug info by
-    // merging nonadjacent loads.
-
-    // This should be run after scheduling, but before register allocation. It
-    // also needs extra copies to the address operand to be eliminated.
-    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
-    insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
-  }
-  addPass(createSIShrinkInstructionsPass(), false);
-  addPass(createSIFixSGPRLiveRangesPass(), false);
-}
-
-void GCNPassConfig::addPostRegAlloc() {
-  addPass(createSIPrepareScratchRegs(), false);
-  addPass(createSIShrinkInstructionsPass(), false);
-}
-
-void GCNPassConfig::addPreSched2() {
-  addPass(createSIInsertWaits(*TM), false);
-}
-
-void GCNPassConfig::addPreEmitPass() {
-  addPass(createSILowerControlFlowPass(*TM), false);
-}
-
-TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new GCNPassConfig(this, PM);
-}
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SITypeRewriter.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/SITypeRewriter.cpp (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/SITypeRewriter.cpp (nonexistent)
@@ -1,161 +0,0 @@
-//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass performs the following type substitutions on all
-/// non-compute shaders:
-///
-/// v16i8 => i128
-///   - v16i8 is used for constant memory resource descriptors. This type is
-///     legal for some compute APIs, and we don't want to declare it as legal
-///     in the backend, because we want the legalizer to expand all v16i8
-///     operations.
-/// v1* => *
-///   - Having v1* types complicates the legalizer and we can easily replace
-///     them with the element type.
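A rough string-level illustration of the renaming this implies for intrinsic calls (hypothetical helper; the pass itself works on LLVM types and Function objects, not strings): a v16i8 argument is bitcast to v4i32 and the callee name gains a ".v4i32" suffix, while a v1i32 argument is scalarized and "v1i32" in the name collapses to "i32".

#include <string>

static std::string rewriteIntrinsicName(std::string Name, bool WidenedV16i8,
                                        bool ScalarizedV1i32) {
  if (WidenedV16i8)
    Name += ".v4i32";                // e.g. llvm.foo      -> llvm.foo.v4i32
  if (ScalarizedV1i32) {
    std::string::size_type Pos = Name.find("v1i32");
    if (Pos != std::string::npos)
      Name.replace(Pos, 5, "i32");   // e.g. llvm.bar.v1i32 -> llvm.bar.i32
  }
  return Name;
}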
-//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { - -class SITypeRewriter : public FunctionPass, - public InstVisitor { - - static char ID; - Module *Mod; - Type *v16i8; - Type *v4i32; - -public: - SITypeRewriter() : FunctionPass(ID) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - const char *getPassName() const override { - return "SI Type Rewriter"; - } - void visitLoadInst(LoadInst &I); - void visitCallInst(CallInst &I); - void visitBitCast(BitCastInst &I); -}; - -} // End anonymous namespace - -char SITypeRewriter::ID = 0; - -bool SITypeRewriter::doInitialization(Module &M) { - Mod = &M; - v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); - v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); - return false; -} - -bool SITypeRewriter::runOnFunction(Function &F) { - Attribute A = F.getFnAttribute("ShaderType"); - - unsigned ShaderType = ShaderType::COMPUTE; - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - Str.getAsInteger(0, ShaderType); - } - if (ShaderType == ShaderType::COMPUTE) - return false; - - visit(F); - visit(F); - - return false; -} - -void SITypeRewriter::visitLoadInst(LoadInst &I) { - Value *Ptr = I.getPointerOperand(); - Type *PtrTy = Ptr->getType(); - Type *ElemTy = PtrTy->getPointerElementType(); - IRBuilder<> Builder(&I); - if (ElemTy == v16i8) { - Value *BitCast = Builder.CreateBitCast(Ptr, - PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); - LoadInst *Load = Builder.CreateLoad(BitCast); - SmallVector, 8> MD; - I.getAllMetadataOtherThanDebugLoc(MD); - for (unsigned i = 0, e = MD.size(); i != e; ++i) { - Load->setMetadata(MD[i].first, MD[i].second); - } - Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); - I.replaceAllUsesWith(BitCastLoad); - I.eraseFromParent(); - } -} - -void SITypeRewriter::visitCallInst(CallInst &I) { - IRBuilder<> Builder(&I); - - SmallVector Args; - SmallVector Types; - bool NeedToReplace = false; - Function *F = I.getCalledFunction(); - std::string Name = F->getName(); - for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { - Value *Arg = I.getArgOperand(i); - if (Arg->getType() == v16i8) { - Args.push_back(Builder.CreateBitCast(Arg, v4i32)); - Types.push_back(v4i32); - NeedToReplace = true; - Name = Name + ".v4i32"; - } else if (Arg->getType()->isVectorTy() && - Arg->getType()->getVectorNumElements() == 1 && - Arg->getType()->getVectorElementType() == - Type::getInt32Ty(I.getContext())){ - Type *ElementTy = Arg->getType()->getVectorElementType(); - std::string TypeName = "i32"; - InsertElementInst *Def = cast(Arg); - Args.push_back(Def->getOperand(1)); - Types.push_back(ElementTy); - std::string VecTypeName = "v1" + TypeName; - Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); - NeedToReplace = true; - } else { - Args.push_back(Arg); - Types.push_back(Arg->getType()); - } - } - - if (!NeedToReplace) { - return; - } - Function *NewF = Mod->getFunction(Name); - if (!NewF) { - NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); - NewF->setAttributes(F->getAttributes()); - } - I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); - I.eraseFromParent(); -} - -void SITypeRewriter::visitBitCast(BitCastInst &I) { - IRBuilder<> Builder(&I); - if (I.getDestTy() != v4i32) { - return; - 
} - - if (BitCastInst *Op = dyn_cast(I.getOperand(0))) { - if (Op->getSrcTy() == v4i32) { - I.replaceAllUsesWith(Op->getOperand(0)); - I.eraseFromParent(); - } - } -} - -FunctionPass *llvm::createSITypeRewriter() { - return new SITypeRewriter(); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUPromoteAlloca.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUPromoteAlloca.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUPromoteAlloca.cpp (nonexistent) @@ -1,407 +0,0 @@ -//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass eliminates allocas by either converting them into vectors or -// by migrating them to local address space. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -#define DEBUG_TYPE "amdgpu-promote-alloca" - -using namespace llvm; - -namespace { - -class AMDGPUPromoteAlloca : public FunctionPass, - public InstVisitor { - - static char ID; - Module *Mod; - const AMDGPUSubtarget &ST; - int LocalMemAvailable; - -public: - AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), - LocalMemAvailable(0) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - const char *getPassName() const override { return "AMDGPU Promote Alloca"; } - void visitAlloca(AllocaInst &I); -}; - -} // End anonymous namespace - -char AMDGPUPromoteAlloca::ID = 0; - -bool AMDGPUPromoteAlloca::doInitialization(Module &M) { - Mod = &M; - return false; -} - -bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - - const FunctionType *FTy = F.getFunctionType(); - - LocalMemAvailable = ST.getLocalMemorySize(); - - - // If the function has any arguments in the local address space, then it's - // possible these arguments require the entire local memory space, so - // we cannot use local memory in the pass. - for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { - const Type *ParamTy = FTy->getParamType(i); - if (ParamTy->isPointerTy() && - ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LocalMemAvailable = 0; - DEBUG(dbgs() << "Function has local memory argument. 
Promoting to " - "local memory disabled.\n"); - break; - } - } - - if (LocalMemAvailable > 0) { - // Check how much local memory is being used by global objects - for (Module::global_iterator I = Mod->global_begin(), - E = Mod->global_end(); I != E; ++I) { - GlobalVariable *GV = I; - PointerType *GVTy = GV->getType(); - if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) - continue; - for (Value::use_iterator U = GV->use_begin(), - UE = GV->use_end(); U != UE; ++U) { - Instruction *Use = dyn_cast(*U); - if (!Use) - continue; - if (Use->getParent()->getParent() == &F) - LocalMemAvailable -= - Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType()); - } - } - } - - LocalMemAvailable = std::max(0, LocalMemAvailable); - DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); - - visit(F); - - return false; -} - -static VectorType *arrayTypeToVecType(const Type *ArrayTy) { - return VectorType::get(ArrayTy->getArrayElementType(), - ArrayTy->getArrayNumElements()); -} - -static Value * -calculateVectorIndex(Value *Ptr, - const std::map &GEPIdx) { - if (isa(Ptr)) - return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); - - GetElementPtrInst *GEP = cast(Ptr); - - auto I = GEPIdx.find(GEP); - return I == GEPIdx.end() ? nullptr : I->second; -} - -static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { - // FIXME we only support simple cases - if (GEP->getNumOperands() != 3) - return NULL; - - ConstantInt *I0 = dyn_cast(GEP->getOperand(1)); - if (!I0 || !I0->isZero()) - return NULL; - - return GEP->getOperand(2); -} - -// Not an instruction handled below to turn into a vector. -// -// TODO: Check isTriviallyVectorizable for calls and handle other -// instructions. -static bool canVectorizeInst(Instruction *Inst) { - switch (Inst->getOpcode()) { - case Instruction::Load: - case Instruction::Store: - case Instruction::BitCast: - case Instruction::AddrSpaceCast: - return true; - default: - return false; - } -} - -static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { - Type *AllocaTy = Alloca->getAllocatedType(); - - DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); - - // FIXME: There is no reason why we can't support larger arrays, we - // are just being conservative for now. - if (!AllocaTy->isArrayTy() || - AllocaTy->getArrayElementType()->isVectorTy() || - AllocaTy->getArrayNumElements() > 4) { - - DEBUG(dbgs() << " Cannot convert type to vector"); - return false; - } - - std::map GEPVectorIdx; - std::vector WorkList; - for (User *AllocaUser : Alloca->users()) { - GetElementPtrInst *GEP = dyn_cast(AllocaUser); - if (!GEP) { - if (!canVectorizeInst(cast(AllocaUser))) - return false; - - WorkList.push_back(AllocaUser); - continue; - } - - Value *Index = GEPToVectorIndex(GEP); - - // If we can't compute a vector index from this GEP, then we can't - // promote this alloca to vector. 
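  // Illustrative shapes: a GEP of the form (alloca, 0, i) produced
  // Index == i above, while (alloca, 1, i) or a GEP with extra indices
  // produced a null Index and defeats the promotion here.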
- if (!Index) { - DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); - return false; - } - - GEPVectorIdx[GEP] = Index; - for (User *GEPUser : AllocaUser->users()) { - if (!canVectorizeInst(cast(GEPUser))) - return false; - - WorkList.push_back(GEPUser); - } - } - - VectorType *VectorTy = arrayTypeToVecType(AllocaTy); - - DEBUG(dbgs() << " Converting alloca to vector " - << *AllocaTy << " -> " << *VectorTy << '\n'); - - for (std::vector::iterator I = WorkList.begin(), - E = WorkList.end(); I != E; ++I) { - Instruction *Inst = cast(*I); - IRBuilder<> Builder(Inst); - switch (Inst->getOpcode()) { - case Instruction::Load: { - Value *Ptr = Inst->getOperand(0); - Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); - Value *VecValue = Builder.CreateLoad(BitCast); - Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); - Inst->replaceAllUsesWith(ExtractElement); - Inst->eraseFromParent(); - break; - } - case Instruction::Store: { - Value *Ptr = Inst->getOperand(1); - Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); - Value *VecValue = Builder.CreateLoad(BitCast); - Value *NewVecValue = Builder.CreateInsertElement(VecValue, - Inst->getOperand(0), - Index); - Builder.CreateStore(NewVecValue, BitCast); - Inst->eraseFromParent(); - break; - } - case Instruction::BitCast: - case Instruction::AddrSpaceCast: - break; - - default: - Inst->dump(); - llvm_unreachable("Inconsistency in instructions promotable to vector"); - } - } - return true; -} - -static bool collectUsesWithPtrTypes(Value *Val, std::vector &WorkList) { - bool Success = true; - for (User *User : Val->users()) { - if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) - continue; - if (isa(User)) { - WorkList.push_back(User); - continue; - } - - // FIXME: Correctly handle ptrtoint instructions. - Instruction *UseInst = dyn_cast(User); - if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt) - return false; - - if (!User->getType()->isPointerTy()) - continue; - - WorkList.push_back(User); - - Success &= collectUsesWithPtrTypes(User, WorkList); - } - return Success; -} - -void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { - IRBuilder<> Builder(&I); - - // First try to replace the alloca with a vector - Type *AllocaTy = I.getAllocatedType(); - - DEBUG(dbgs() << "Trying to promote " << I << '\n'); - - if (tryPromoteAllocaToVector(&I)) - return; - - DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); - - // FIXME: This is the maximum work group size. We should try to get - // value from the reqd_work_group_size function attribute if it is - // available. 
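  // With the default of 256 below, e.g. a [4 x i32] alloca is charged
  // 256 * 16 = 4096 bytes against LocalMemAvailable.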
- unsigned WorkGroupSize = 256; - int AllocaSize = - WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); - - if (AllocaSize > LocalMemAvailable) { - DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); - return; - } - - std::vector WorkList; - - if (!collectUsesWithPtrTypes(&I, WorkList)) { - DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return; - } - - DEBUG(dbgs() << "Promoting alloca to local memory\n"); - LocalMemAvailable -= AllocaSize; - - Type *GVTy = ArrayType::get(I.getAllocatedType(), 256); - GlobalVariable *GV = new GlobalVariable( - *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, - GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); - - FunctionType *FTy = FunctionType::get( - Type::getInt32Ty(Mod->getContext()), false); - AttributeSet AttrSet; - AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); - - Value *ReadLocalSizeY = Mod->getOrInsertFunction( - "llvm.r600.read.local.size.y", FTy, AttrSet); - Value *ReadLocalSizeZ = Mod->getOrInsertFunction( - "llvm.r600.read.local.size.z", FTy, AttrSet); - Value *ReadTIDIGX = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.x", FTy, AttrSet); - Value *ReadTIDIGY = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.y", FTy, AttrSet); - Value *ReadTIDIGZ = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.z", FTy, AttrSet); - - Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {}); - Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {}); - Value *TIdX = Builder.CreateCall(ReadTIDIGX, {}); - Value *TIdY = Builder.CreateCall(ReadTIDIGY, {}); - Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {}); - - Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); - Tmp0 = Builder.CreateMul(Tmp0, TIdX); - Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); - Value *TID = Builder.CreateAdd(Tmp0, Tmp1); - TID = Builder.CreateAdd(TID, TIdZ); - - std::vector Indices; - Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); - Indices.push_back(TID); - - Value *Offset = Builder.CreateGEP(GVTy, GV, Indices); - I.mutateType(Offset->getType()); - I.replaceAllUsesWith(Offset); - I.eraseFromParent(); - - for (std::vector::iterator i = WorkList.begin(), - e = WorkList.end(); i != e; ++i) { - Value *V = *i; - CallInst *Call = dyn_cast(V); - if (!Call) { - Type *EltTy = V->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); - - // The operand's value should be corrected on its own. - if (isa(V)) - continue; - - // FIXME: It doesn't really make sense to try to do this for all - // instructions. 
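  // e.g. a GEP chained off the alloca keeps its operands, but its result
  // type is rewritten to a local-address-space pointer (illustrative).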
- V->mutateType(NewTy); - continue; - } - - IntrinsicInst *Intr = dyn_cast(Call); - if (!Intr) { - std::vector ArgTypes; - for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); - ArgIdx != ArgEnd; ++ArgIdx) { - ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); - } - Function *F = Call->getCalledFunction(); - FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, - F->isVarArg()); - Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), - NewType, F->getAttributes()); - Function *NewF = cast(C); - Call->setCalledFunction(NewF); - continue; - } - - Builder.SetInsertPoint(Intr); - switch (Intr->getIntrinsicID()) { - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - // These intrinsics are for address space 0 only - Intr->eraseFromParent(); - continue; - case Intrinsic::memcpy: { - MemCpyInst *MemCpy = cast(Intr); - Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), - MemCpy->getLength(), MemCpy->getAlignment(), - MemCpy->isVolatile()); - Intr->eraseFromParent(); - continue; - } - case Intrinsic::memset: { - MemSetInst *MemSet = cast(Intr); - Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), - MemSet->getLength(), MemSet->getAlignment(), - MemSet->isVolatile()); - Intr->eraseFromParent(); - continue; - } - default: - Intr->dump(); - llvm_unreachable("Don't know how to promote alloca intrinsic use."); - } - } -} - -FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { - return new AMDGPUPromoteAlloca(ST); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600Packetizer.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600Packetizer.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600Packetizer.cpp (nonexistent) @@ -1,408 +0,0 @@ -//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass implements instructions packetization for R600. It unsets isLast -/// bit of instructions inside a bundle and substitutes src register with -/// PreviousVector when applicable. 
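The substitution described above forwards a previous instruction group's results through the PV (and, for the trans slot, PS) registers. The channel-to-PV mapping used by getPreviousVector below, as a plain table with stand-ins for the AMDGPU register enumerators:

enum PVRegister { PV_X, PV_Y, PV_Z, PV_W, PS };

static PVRegister pvForChannel(unsigned Chan) {
  switch (Chan) {
  case 0: return PV_X;
  case 1: return PV_Y;
  case 2: return PV_Z;
  case 3: return PV_W;
  default: __builtin_unreachable(); // mirrors llvm_unreachable("Invalid Chan")
  }
}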
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/Debug.h" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "llvm/CodeGen/DFAPacketizer.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "packets" - -namespace { - -class R600Packetizer : public MachineFunctionPass { - -public: - static char ID; - R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - const char *getPassName() const override { - return "R600 Packetizer"; - } - - bool runOnMachineFunction(MachineFunction &Fn) override; -}; -char R600Packetizer::ID = 0; - -class R600PacketizerList : public VLIWPacketizerList { - -private: - const R600InstrInfo *TII; - const R600RegisterInfo &TRI; - bool VLIW5; - bool ConsideredInstUsesAlreadyWrittenVectorElement; - - unsigned getSlot(const MachineInstr *MI) const { - return TRI.getHWRegChan(MI->getOperand(0).getReg()); - } - - /// \returns register to PV chan mapping for bundle/single instructions that - /// immediately precedes I. - DenseMap getPreviousVector(MachineBasicBlock::iterator I) - const { - DenseMap Result; - I--; - if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) - return Result; - MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); - if (I->isBundle()) - BI++; - int LastDstChan = -1; - do { - bool isTrans = false; - int BISlot = getSlot(BI); - if (LastDstChan >= BISlot) - isTrans = true; - LastDstChan = BISlot; - if (TII->isPredicated(BI)) - continue; - int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); - if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) - continue; - int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); - if (DstIdx == -1) { - continue; - } - unsigned Dst = BI->getOperand(DstIdx).getReg(); - if (isTrans || TII->isTransOnly(BI)) { - Result[Dst] = AMDGPU::PS; - continue; - } - if (BI->getOpcode() == AMDGPU::DOT4_r600 || - BI->getOpcode() == AMDGPU::DOT4_eg) { - Result[Dst] = AMDGPU::PV_X; - continue; - } - if (Dst == AMDGPU::OQAP) { - continue; - } - unsigned PVReg = 0; - switch (TRI.getHWRegChan(Dst)) { - case 0: - PVReg = AMDGPU::PV_X; - break; - case 1: - PVReg = AMDGPU::PV_Y; - break; - case 2: - PVReg = AMDGPU::PV_Z; - break; - case 3: - PVReg = AMDGPU::PV_W; - break; - default: - llvm_unreachable("Invalid Chan"); - } - Result[Dst] = PVReg; - } while ((++BI)->isBundledWithPred()); - return Result; - } - - void substitutePV(MachineInstr *MI, const DenseMap &PVs) - const { - unsigned Ops[] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 - }; - for (unsigned i = 0; i < 3; i++) { - int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); - if (OperandIdx < 0) - continue; - unsigned Src = MI->getOperand(OperandIdx).getReg(); - const DenseMap::const_iterator It = PVs.find(Src); - if (It != PVs.end()) - MI->getOperand(OperandIdx).setReg(It->second); - } - } -public: - // Ctor. 
- R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) - : VLIWPacketizerList(MF, MLI, true), - TII(static_cast( - MF.getSubtarget().getInstrInfo())), - TRI(TII->getRegisterInfo()) { - VLIW5 = !MF.getSubtarget().hasCaymanISA(); - } - - // initPacketizerState - initialize some internal flags. - void initPacketizerState() override { - ConsideredInstUsesAlreadyWrittenVectorElement = false; - } - - // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) override { - return false; - } - - // isSoloInstruction - return true if instruction MI can not be packetized - // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) override { - if (TII->isVector(*MI)) - return true; - if (!TII->isALUInstr(MI->getOpcode())) - return true; - if (MI->getOpcode() == AMDGPU::GROUP_BARRIER) - return true; - // XXX: This can be removed once the packetizer properly handles all the - // LDS instruction group restrictions. - if (TII->isLDSInstr(MI->getOpcode())) - return true; - return false; - } - - // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ - // together. - bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { - MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); - if (getSlot(MII) == getSlot(MIJ)) - ConsideredInstUsesAlreadyWrittenVectorElement = true; - // Does MII and MIJ share the same pred_sel ? - int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), - OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); - unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, - PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; - if (PredI != PredJ) - return false; - if (SUJ->isSucc(SUI)) { - for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { - const SDep &Dep = SUJ->Succs[i]; - if (Dep.getSUnit() != SUI) - continue; - if (Dep.getKind() == SDep::Anti) - continue; - if (Dep.getKind() == SDep::Output) - if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) - continue; - return false; - } - } - - bool ARDef = TII->definesAddressRegister(MII) || - TII->definesAddressRegister(MIJ); - bool ARUse = TII->usesAddressRegister(MII) || - TII->usesAddressRegister(MIJ); - if (ARDef && ARUse) - return false; - - return true; - } - - // isLegalToPruneDependencies - Is it legal to prune dependece between SUI - // and SUJ. - bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { - return false; - } - - void setIsLastBit(MachineInstr *MI, unsigned Bit) const { - unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); - MI->getOperand(LastOp).setImm(Bit); - } - - bool isBundlableWithCurrentPMI(MachineInstr *MI, - const DenseMap &PV, - std::vector &BS, - bool &isTransSlot) { - isTransSlot = TII->isTransOnly(MI); - assert (!isTransSlot || VLIW5); - - // Is the dst reg sequence legal ? - if (!isTransSlot && !CurrentPacketMIs.empty()) { - if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) { - if (ConsideredInstUsesAlreadyWrittenVectorElement && - !TII->isVectorOnly(MI) && VLIW5) { - isTransSlot = true; - DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump();); - } - else - return false; - } - } - - // Are the Constants limitations met ? 
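  // The candidate MI is tentatively pushed below, the packet-wide limit
  // checks run on the extended packet, and on failure it is popped again.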
- CurrentPacketMIs.push_back(MI); - if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { - DEBUG( - dbgs() << "Couldn't pack :\n"; - MI->dump(); - dbgs() << "with the following packets :\n"; - for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { - CurrentPacketMIs[i]->dump(); - dbgs() << "\n"; - } - dbgs() << "because of Consts read limitations\n"; - ); - CurrentPacketMIs.pop_back(); - return false; - } - - // Is there a BankSwizzle set that meet Read Port limitations ? - if (!TII->fitsReadPortLimitations(CurrentPacketMIs, - PV, BS, isTransSlot)) { - DEBUG( - dbgs() << "Couldn't pack :\n"; - MI->dump(); - dbgs() << "with the following packets :\n"; - for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { - CurrentPacketMIs[i]->dump(); - dbgs() << "\n"; - } - dbgs() << "because of Read port limitations\n"; - ); - CurrentPacketMIs.pop_back(); - return false; - } - - // We cannot read LDS source registrs from the Trans slot. - if (isTransSlot && TII->readsLDSSrcReg(MI)) - return false; - - CurrentPacketMIs.pop_back(); - return true; - } - - MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { - MachineBasicBlock::iterator FirstInBundle = - CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front(); - const DenseMap &PV = - getPreviousVector(FirstInBundle); - std::vector BS; - bool isTransSlot; - - if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) { - for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { - MachineInstr *MI = CurrentPacketMIs[i]; - unsigned Op = TII->getOperandIdx(MI->getOpcode(), - AMDGPU::OpName::bank_swizzle); - MI->getOperand(Op).setImm(BS[i]); - } - unsigned Op = TII->getOperandIdx(MI->getOpcode(), - AMDGPU::OpName::bank_swizzle); - MI->getOperand(Op).setImm(BS.back()); - if (!CurrentPacketMIs.empty()) - setIsLastBit(CurrentPacketMIs.back(), 0); - substitutePV(MI, PV); - MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI); - if (isTransSlot) { - endPacket(std::next(It)->getParent(), std::next(It)); - } - return It; - } - endPacket(MI->getParent(), MI); - if (TII->isTransOnly(MI)) - return MI; - return VLIWPacketizerList::addToPacket(MI); - } -}; - -bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); - MachineLoopInfo &MLI = getAnalysis(); - - // Instantiate the packetizer. - R600PacketizerList Packetizer(Fn, MLI); - - // DFA state table should not be empty. - assert(Packetizer.getResourceTracker() && "Empty DFA table!"); - - // - // Loop over all basic blocks and remove KILL pseudo-instructions - // These instructions confuse the dependence analysis. Consider: - // D0 = ... (Insn 0) - // R0 = KILL R0, D0 (Insn 1) - // R0 = ... (Insn 2) - // Here, Insn 1 will result in the dependence graph not emitting an output - // dependence between Insn 0 and Insn 2. This can lead to incorrect - // packetization - // - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock::iterator End = MBB->end(); - MachineBasicBlock::iterator MI = MBB->begin(); - while (MI != End) { - if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || - (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { - MachineBasicBlock::iterator DeleteMI = MI; - ++MI; - MBB->erase(DeleteMI); - End = MBB->end(); - continue; - } - ++MI; - } - } - - // Loop over all of the basic blocks. 
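  // Regions are carved backward from the end of each block, splitting at
  // scheduling boundaries; empty and single-instruction regions are skipped.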
- for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - // Find scheduling regions and schedule / packetize each region. - unsigned RemainingCount = MBB->size(); - for(MachineBasicBlock::iterator RegionEnd = MBB->end(); - RegionEnd != MBB->begin();) { - // The next region starts above the previous region. Look backward in the - // instruction stream until we find the nearest boundary. - MachineBasicBlock::iterator I = RegionEnd; - for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) - break; - } - I = MBB->begin(); - - // Skip empty scheduling regions. - if (I == RegionEnd) { - RegionEnd = std::prev(RegionEnd); - --RemainingCount; - continue; - } - // Skip regions with one instruction. - if (I == std::prev(RegionEnd)) { - RegionEnd = std::prev(RegionEnd); - continue; - } - - Packetizer.PacketizeMIs(MBB, I, RegionEnd); - RegionEnd = I; - } - } - - return true; - -} - -} // end anonymous namespace - -llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { - return new R600Packetizer(tm); -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp (nonexistent) @@ -1,63 +0,0 @@ -//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPURegisterInfo.h" -#include "AMDGPUTargetMachine.h" - -using namespace llvm; - -AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} - -//===----------------------------------------------------------------------===// -// Function handling callbacks - Functions are a seldom used feature of GPUS, so -// they are not supported at this time. 
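getSubRegFromChannel below is a bounds-checked table lookup; a model with plain indices standing in for AMDGPU::sub0..sub15:

#include <cassert>

static unsigned subRegFromChannel(unsigned Channel) {
  static const unsigned SubRegs[16] = {0, 1, 2,  3,  4,  5,  6,  7,
                                       8, 9, 10, 11, 12, 13, 14, 15};
  assert(Channel < 16 && "channel out of range");
  return SubRegs[Channel];
}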
-//===----------------------------------------------------------------------===// - -const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; - -const MCPhysReg* -AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - return &CalleeSavedReg; -} - -void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const { - llvm_unreachable("Subroutines not supported yet"); -} - -unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return AMDGPU::NoRegister; -} - -unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { - static const unsigned SubRegs[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, - AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, - AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, - AMDGPU::sub15 - }; - - assert(Channel < array_lengthof(SubRegs)); - return SubRegs[Channel]; -} - -unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { - - return getSubRegFromChannel(IndirectIndex); -} - -#define GET_REGINFO_TARGET_DESC -#include "AMDGPUGenRegisterInfo.inc" Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td (nonexistent) @@ -1,682 +0,0 @@ -//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains instruction defs that are common to all hw codegen -// targets. 
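The AMDGPUInst class below parks isRegisterLoad/isRegisterStore in the top two TSFlags bits; on the C++ side such fields come back out with plain mask tests (hypothetical helpers, assuming that bit layout):

#include <cstdint>

static bool isRegisterLoad(uint64_t TSFlags) {
  return (TSFlags & (1ULL << 63)) != 0;
}
static bool isRegisterStore(uint64_t TSFlags) {
  return (TSFlags & (1ULL << 62)) != 0;
}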
-// -//===----------------------------------------------------------------------===// - -class AMDGPUInst pattern> : Instruction { - field bit isRegisterLoad = 0; - field bit isRegisterStore = 0; - - let Namespace = "AMDGPU"; - let OutOperandList = outs; - let InOperandList = ins; - let AsmString = asm; - let Pattern = pattern; - let Itinerary = NullALU; - - let TSFlags{63} = isRegisterLoad; - let TSFlags{62} = isRegisterStore; -} - -class AMDGPUShaderInst pattern> - : AMDGPUInst { - - field bits<32> Inst = 0xffffffff; - -} - -def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; -def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; -def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; - -def InstFlag : OperandWithDefaultOps ; -def ADDRIndirect : ComplexPattern; - -let OperandType = "OPERAND_IMMEDIATE" in { - -def u32imm : Operand { - let PrintMethod = "printU32ImmOperand"; -} - -def u16imm : Operand { - let PrintMethod = "printU16ImmOperand"; -} - -def u8imm : Operand { - let PrintMethod = "printU8ImmOperand"; -} - -} // End OperandType = "OPERAND_IMMEDIATE" - -//===--------------------------------------------------------------------===// -// Custom Operands -//===--------------------------------------------------------------------===// -def brtarget : Operand; - -//===----------------------------------------------------------------------===// -// PatLeafs for floating-point comparisons -//===----------------------------------------------------------------------===// - -def COND_OEQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}] ->; - -def COND_ONE : PatLeaf < - (cond), - [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] ->; - -def COND_OGT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] ->; - -def COND_OGE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}] ->; - -def COND_OLT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}] ->; - -def COND_OLE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] ->; - - -def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; -def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; - -//===----------------------------------------------------------------------===// -// PatLeafs for unsigned / unordered comparisons -//===----------------------------------------------------------------------===// - -def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; -def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; -def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; -def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; -def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>; -def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; - -// XXX - For some reason R600 version is preferring to use unordered -// for setne? 
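Each PatLeaf above folds the ordered and the plain form of a comparison into one matcher, and COND_UNE_NE just below does the same for setne; a scalar model of COND_OEQ's predicate:

enum CondCode { SETOEQ, SETEQ, SETONE, SETNE /* ... */ };

static bool matchesCondOEQ(CondCode CC) {
  return CC == SETOEQ || CC == SETEQ; // ordered-equal or plain equal
}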
-def COND_UNE_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] ->; - -//===----------------------------------------------------------------------===// -// PatLeafs for signed comparisons -//===----------------------------------------------------------------------===// - -def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>; -def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>; -def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>; -def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>; - -//===----------------------------------------------------------------------===// -// PatLeafs for integer equality -//===----------------------------------------------------------------------===// - -def COND_EQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}] ->; - -def COND_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}] ->; - -def COND_NULL : PatLeaf < - (cond), - [{(void)N; return false;}] ->; - -//===----------------------------------------------------------------------===// -// Load/Store Pattern Fragments -//===----------------------------------------------------------------------===// - -class PrivateMemOp : PatFrag (N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; -}]>; - -class PrivateLoad : PrivateMemOp < - (ops node:$ptr), (op node:$ptr) ->; - -class PrivateStore : PrivateMemOp < - (ops node:$value, node:$ptr), (op node:$value, node:$ptr) ->; - -def load_private : PrivateLoad ; - -def truncstorei8_private : PrivateStore ; -def truncstorei16_private : PrivateStore ; -def store_private : PrivateStore ; - -def global_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; - -// Global address space loads -def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -// Constant address space loads -def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -class AZExtLoadBase : PatFrag<(ops node:$ptr), - (ld_node node:$ptr), [{ - LoadSDNode *L = cast(N); - return L->getExtensionType() == ISD::ZEXTLOAD || - L->getExtensionType() == ISD::EXTLOAD; -}]>; - -def az_extload : AZExtLoadBase ; - -def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def extloadi8_private : PrivateLoad ; -def sextloadi8_private : 
PrivateLoad ; - -def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def extloadi16_private : PrivateLoad ; -def sextloadi16_private : PrivateLoad ; - -def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def az_extloadi32_global : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def az_extloadi32_flat : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def az_extloadi32_constant : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; - -def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; - -def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast(N)); -}]>; - -def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast(N)); -}]>; - -def local_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; - -def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; - -def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; - -def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -class Aligned8Bytes : PatFrag (N)->getAlignment() % 8 == 0; -}]>; - -def local_load_aligned8bytes : Aligned8Bytes < - (ops node:$ptr), (local_load node:$ptr) ->; - -def local_store_aligned8bytes : Aligned8Bytes < - (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr) ->; - -class local_binary_atomic_op : - PatFrag<(ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; - - -def atomic_swap_local : local_binary_atomic_op; -def atomic_load_add_local : local_binary_atomic_op; -def atomic_load_sub_local : local_binary_atomic_op; -def atomic_load_and_local : 
local_binary_atomic_op; -def atomic_load_or_local : local_binary_atomic_op; -def atomic_load_xor_local : local_binary_atomic_op; -def atomic_load_nand_local : local_binary_atomic_op; -def atomic_load_min_local : local_binary_atomic_op; -def atomic_load_max_local : local_binary_atomic_op; -def atomic_load_umin_local : local_binary_atomic_op; -def atomic_load_umax_local : local_binary_atomic_op; - -def mskor_global : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; -}]>; - -multiclass AtomicCmpSwapLocal { - - def _32_local : PatFrag < - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast(N); - return AN->getMemoryVT() == MVT::i32 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; - }]>; - - def _64_local : PatFrag< - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast(N); - return AN->getMemoryVT() == MVT::i64 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; - }]>; -} - -defm atomic_cmp_swap : AtomicCmpSwapLocal ; - -def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def flat_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast(N)); -}]>; - -def mskor_flat : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; -}]>; - -class global_binary_atomic_op : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] ->; - -def atomic_swap_global : global_binary_atomic_op; -def atomic_add_global : global_binary_atomic_op; -def atomic_and_global : global_binary_atomic_op; -def atomic_max_global : global_binary_atomic_op; -def atomic_min_global : global_binary_atomic_op; -def atomic_or_global : global_binary_atomic_op; -def atomic_sub_global : global_binary_atomic_op; -def atomic_umax_global : global_binary_atomic_op; -def atomic_umin_global : global_binary_atomic_op; -def atomic_xor_global : global_binary_atomic_op; - -//===----------------------------------------------------------------------===// -// Misc Pattern Fragments -//===----------------------------------------------------------------------===// - -class Constants { -int TWO_PI = 0x40c90fdb; -int PI = 0x40490fdb; -int TWO_PI_INV = 0x3e22f983; -int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding -int FP32_NEG_ONE = 0xbf800000; -int FP32_ONE = 0x3f800000; -} -def CONST : Constants; - -def FP_ZERO : PatLeaf < - (fpimm), - [{return N->getValueAPF().isZero();}] ->; - -def FP_ONE : PatLeaf < - (fpimm), - [{return N->isExactlyValue(1.0);}] ->; - -def FP_HALF : PatLeaf < - (fpimm), - [{return N->isExactlyValue(0.5);}] ->; - -let isCodeGenOnly = 1, isPseudo = 1 in { - -let usesCustomInserter = 1 in { - -class CLAMP : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "CLAMP $dst, $src0", - [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] ->; - -class FABS : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "FABS $dst, $src0", - [(set f32:$dst, (fabs f32:$src0))] ->; - -class FNEG : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "FNEG $dst, $src0", - [(set f32:$dst, (fneg f32:$src0))] ->; - -} // usesCustomInserter = 1 - -multiclass RegisterLoadStore { 
-let UseNamedOperandTable = 1 in { - - def RegisterLoad : AMDGPUShaderInst < - (outs dstClass:$dst), - (ins addrClass:$addr, i32imm:$chan), - "RegisterLoad $dst, $addr", - [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))] - > { - let isRegisterLoad = 1; - } - - def RegisterStore : AMDGPUShaderInst < - (outs), - (ins dstClass:$val, addrClass:$addr, i32imm:$chan), - "RegisterStore $val, $addr", - [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))] - > { - let isRegisterStore = 1; - } -} -} - -} // End isCodeGenOnly = 1, isPseudo = 1 - -/* Generic helper patterns for intrinsics */ -/* -------------------------------------- */ - -class POW_Common - : Pat < - (fpow f32:$src0, f32:$src1), - (exp_ieee (mul f32:$src1, (log_ieee f32:$src0))) ->; - -/* Other helper patterns */ -/* --------------------- */ - -/* Extract element pattern */ -class Extract_Element - : Pat< - (sub_type (vector_extract vec_type:$src, sub_idx)), - (EXTRACT_SUBREG $src, sub_reg) ->; - -/* Insert element pattern */ -class Insert_Element - : Pat < - (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), - (INSERT_SUBREG $vec, $elem, sub_reg) ->; - -// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer -// can handle COPY instructions. -// bitconvert pattern -class BitConvert : Pat < - (dt (bitconvert (st rc:$src0))), - (dt rc:$src0) ->; - -// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer -// can handle COPY instructions. -class DwordAddrPat : Pat < - (vt (AMDGPUdwordaddr (vt rc:$addr))), - (vt rc:$addr) ->; - -// BFI_INT patterns - -multiclass BFIPatterns { - // Definition from ISA doc: - // (y & x) | (z & ~x) - def : Pat < - (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), - (BFI_INT $x, $y, $z) - >; - - // SHA-256 Ch function - // z ^ (x & (y ^ z)) - def : Pat < - (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), - (BFI_INT $x, $y, $z) - >; - - def : Pat < - (fcopysign f32:$src0, f32:$src1), - (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) - >; - - def : Pat < - (f64 (fcopysign f64:$src0, f64:$src1)), - (REG_SEQUENCE RC64, - (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, - (BFI_INT (LoadImm32 0x7fffffff), - (i32 (EXTRACT_SUBREG $src0, sub1)), - (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) - >; -} - -// SHA-256 Ma patterns - -// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y -class SHA256MaPattern : Pat < - (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), - (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) ->; - -// Bitfield extract patterns - -def IMMZeroBasedBitfieldMask : PatLeaf <(imm), [{ - return isMask_32(N->getZExtValue()); -}]>; - -def IMMPopCount : SDNodeXFormgetTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N), - MVT::i32); -}]>; - -class BFEPattern : Pat < - (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), - (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) ->; - -// rotr pattern -class ROTRPattern : Pat < - (rotr i32:$src0, i32:$src1), - (BIT_ALIGN $src0, $src0, $src1) ->; - -// 24-bit arithmetic patterns -def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; - -// Special conversion patterns - -def cvt_rpi_i32_f32 : PatFrag < - (ops node:$src), - (fp_to_sint (ffloor (fadd $src, FP_HALF))), - [{ (void) N; return TM.Options.NoNaNsFPMath; }] ->; - -def cvt_flr_i32_f32 : PatFrag < - (ops node:$src), - (fp_to_sint (ffloor $src)), - [{ (void)N; return TM.Options.NoNaNsFPMath; }] ->; - -/* -class UMUL24Pattern : Pat < - (mul U24:$x, U24:$y), - (UMUL24 $x, $y) 
->;
-*/
-
-class IMad24Pat <Instruction Inst> : Pat <
-  (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2),
-  (Inst $src0, $src1, $src2)
->;
-
-class UMad24Pat <Instruction Inst> : Pat <
-  (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2),
-  (Inst $src0, $src1, $src2)
->;
-
-multiclass Expand24IBitOps <Instruction MulInst, Instruction AddInst> {
-  def _expand_imad24 : Pat <
-    (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2),
-    (AddInst (MulInst $src0, $src1), $src2)
-  >;
-
-  def _expand_imul24 : Pat <
-    (AMDGPUmul_i24 i32:$src0, i32:$src1),
-    (MulInst $src0, $src1)
-  >;
-}
-
-multiclass Expand24UBitOps <Instruction MulInst, Instruction AddInst> {
-  def _expand_umad24 : Pat <
-    (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2),
-    (AddInst (MulInst $src0, $src1), $src2)
-  >;
-
-  def _expand_umul24 : Pat <
-    (AMDGPUmul_u24 i32:$src0, i32:$src1),
-    (MulInst $src0, $src1)
-  >;
-}
-
-class RcpPat <Instruction RcpInst, ValueType vt> : Pat <
-  (fdiv FP_ONE, vt:$src),
-  (RcpInst $src)
->;
-
-class RsqPat <Instruction RsqInst, ValueType vt> : Pat <
-  (AMDGPUrcp (fsqrt vt:$src)),
-  (RsqInst $src)
->;
-
-include "R600Instructions.td"
-include "R700Instructions.td"
-include "EvergreenInstructions.td"
-include "CaymanInstructions.td"
-
-include "SIInstrInfo.td"
-
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600RegisterInfo.h
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/R600RegisterInfo.h (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600RegisterInfo.h (nonexistent)
@@ -1,49 +0,0 @@
-//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Interface definition for R600RegisterInfo
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H
-#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H
-
-#include "AMDGPURegisterInfo.h"
-
-namespace llvm {
-
-class AMDGPUSubtarget;
-
-struct R600RegisterInfo : public AMDGPURegisterInfo {
-  RegClassWeight RCW;
-
-  R600RegisterInfo();
-
-  BitVector getReservedRegs(const MachineFunction &MF) const override;
-
-  /// \brief get the HW encoding for a register's channel.
-  unsigned getHWRegChan(unsigned reg) const;
-
-  unsigned getHWRegIndex(unsigned Reg) const override;
-
-  /// \brief get the register class of the specified type to use in the
-  /// CFGStructurizer
-  const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const override;
-
-  const RegClassWeight &
-  getRegClassWeight(const TargetRegisterClass *RC) const override;
-
-  // \returns true if \p Reg can be defined in one ALU clause and used in another.
-  bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
-};
-
-} // End namespace llvm
-
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/VIInstructions.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/VIInstructions.td (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/VIInstructions.td (nonexistent)
@@ -1,106 +0,0 @@
-//===-- VIInstructions.td - VI Instruction Definitions --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
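The Expand24*BitOps multiclasses above rewrite the mad24/mul24 nodes into a plain multiply-then-add when no fused instruction is available. A short sketch of the arithmetic those patterns preserve (illustrative C++, assuming operands already fit in 24 bits):

#include <cassert>
#include <cstdint>

// 24-bit multiply: only the low 24 bits of each operand participate.
static uint32_t umul24(uint32_t A, uint32_t B) {
  return (A & 0xFFFFFF) * (B & 0xFFFFFF);
}

// The shape of _expand_umad24 above: the MulInst result feeds the AddInst.
static uint32_t umad24(uint32_t A, uint32_t B, uint32_t C) {
  return umul24(A, B) + C;
}

int main() {
  assert(umad24(3, 5, 7) == 22);
  return 0;
}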
-//
-//===----------------------------------------------------------------------===//
-// Instruction definitions for VI and newer.
-//===----------------------------------------------------------------------===//
-
-let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in {
-
-//===----------------------------------------------------------------------===//
-// VOP1 Instructions
-//===----------------------------------------------------------------------===//
-
-defm V_CVT_F16_U16 : VOP1Inst , "v_cvt_f16_u16", VOP_F16_I16>;
-defm V_CVT_F16_I16 : VOP1Inst , "v_cvt_f16_i16", VOP_F16_I16>;
-defm V_CVT_U16_F16 : VOP1Inst , "v_cvt_u16_f16", VOP_I16_F16>;
-defm V_CVT_I16_F16 : VOP1Inst , "v_cvt_i16_f16", VOP_I16_F16>;
-defm V_RCP_F16 : VOP1Inst , "v_rcp_f16", VOP_F16_F16>;
-defm V_SQRT_F16 : VOP1Inst , "v_sqrt_f16", VOP_F16_F16>;
-defm V_RSQ_F16 : VOP1Inst , "v_rsq_f16", VOP_F16_F16>;
-defm V_LOG_F16 : VOP1Inst , "v_log_f16", VOP_F16_F16>;
-defm V_EXP_F16 : VOP1Inst , "v_exp_f16", VOP_F16_F16>;
-defm V_FREXP_MANT_F16 : VOP1Inst , "v_frexp_mant_f16",
-  VOP_F16_F16
->;
-defm V_FREXP_EXP_I16_F16 : VOP1Inst , "v_frexp_exp_i16_f16",
-  VOP_I16_F16
->;
-defm V_FLOOR_F16 : VOP1Inst , "v_floor_f16", VOP_F16_F16>;
-defm V_CEIL_F16 : VOP1Inst , "v_ceil_f16", VOP_F16_F16>;
-defm V_TRUNC_F16 : VOP1Inst , "v_trunc_f16", VOP_F16_F16>;
-defm V_RNDNE_F16 : VOP1Inst , "v_rndne_f16", VOP_F16_F16>;
-defm V_FRACT_F16 : VOP1Inst , "v_fract_f16", VOP_F16_F16>;
-defm V_SIN_F16 : VOP1Inst , "v_sin_f16", VOP_F16_F16>;
-defm V_COS_F16 : VOP1Inst , "v_cos_f16", VOP_F16_F16>;
-
-//===----------------------------------------------------------------------===//
-// VOP2 Instructions
-//===----------------------------------------------------------------------===//
-
-let isCommutable = 1 in {
-
-defm V_ADD_F16 : VOP2Inst , "v_add_f16", VOP_F16_F16_F16>;
-defm V_SUB_F16 : VOP2Inst , "v_sub_f16", VOP_F16_F16_F16>;
-defm V_SUBREV_F16 : VOP2Inst , "v_subrev_f16", VOP_F16_F16_F16,
-  null_frag, "v_sub_f16"
->;
-defm V_MUL_F16 : VOP2Inst , "v_mul_f16", VOP_F16_F16_F16>;
-defm V_MAC_F16 : VOP2Inst , "v_mac_f16", VOP_F16_F16_F16>;
-} // End isCommutable = 1
-defm V_MADMK_F16 : VOP2MADK , "v_madmk_f16">;
-let isCommutable = 1 in {
-defm V_MADAK_F16 : VOP2MADK , "v_madak_f16">;
-defm V_ADD_U16 : VOP2Inst , "v_add_u16", VOP_I16_I16_I16>;
-defm V_SUB_U16 : VOP2Inst , "v_sub_u16", VOP_I16_I16_I16>;
-defm V_SUBREV_U16 : VOP2Inst , "v_subrev_u16", VOP_I16_I16_I16>;
-defm V_MUL_LO_U16 : VOP2Inst , "v_mul_lo_u16", VOP_I16_I16_I16>;
-} // End isCommutable = 1
-defm V_LSHLREV_B16 : VOP2Inst , "v_lshlrev_b16", VOP_I16_I16_I16>;
-defm V_LSHRREV_B16 : VOP2Inst , "v_lshrrev_b16", VOP_I16_I16_I16>;
-defm V_ASHRREV_B16 : VOP2Inst , "v_ashrrev_b16", VOP_I16_I16_I16>;
-let isCommutable = 1 in {
-defm V_MAX_F16 : VOP2Inst , "v_max_f16", VOP_F16_F16_F16>;
-defm V_MIN_F16 : VOP2Inst , "v_min_f16", VOP_F16_F16_F16>;
-defm V_MAX_U16 : VOP2Inst , "v_max_u16", VOP_I16_I16_I16>;
-defm V_MAX_I16 : VOP2Inst , "v_max_i16", VOP_I16_I16_I16>;
-defm V_MIN_U16 : VOP2Inst , "v_min_u16", VOP_I16_I16_I16>;
-defm V_MIN_I16 : VOP2Inst , "v_min_i16", VOP_I16_I16_I16>;
-} // End isCommutable = 1
-defm V_LDEXP_F16 : VOP2Inst , "v_ldexp_f16", VOP_F16_F16_I16>;
-
-// Aliases to simplify matching of floating-point instructions that are VOP2 on
-// SI and VOP3 on VI.
-
-class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
-  name#" $dst, $src0, $src1",
-  (inst VGPR_32:$dst, 0, VCSrc_32:$src0, 0, VCSrc_32:$src1, 0, 0)
->, PredicateControl {
-  let UseInstAsmMatchConverter = 0;
-}
-
-def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>;
-def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>;
-def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
-def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
-def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
-
-} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
-
-//===----------------------------------------------------------------------===//
-// SMEM Patterns
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isVI] in {
-
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
-  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
->;
-
-} // End Predicates = [isVI]
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp (nonexistent)
@@ -1,112 +0,0 @@
-//===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-// Interface to describe a layout of a stack frame on an AMDIL target machine
-//
-//===----------------------------------------------------------------------===//
-#include "AMDGPUFrameLowering.h"
-#include "AMDGPURegisterInfo.h"
-#include "R600MachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Instructions.h"
-
-using namespace llvm;
-AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
-                                         int LAO, unsigned TransAl)
-  : TargetFrameLowering(D, StackAl, LAO, TransAl) { }
-
-AMDGPUFrameLowering::~AMDGPUFrameLowering() { }
-
-unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
-
-  // XXX: Hardcoding to 1 for now.
-  //
-  // I think the StackWidth should be stored as metadata associated with the
-  // MachineFunction.  This metadata can either be added by a frontend, or
-  // calculated by an R600-specific LLVM IR pass.
-  //
-  // The StackWidth determines how stack objects are laid out in memory.
-  // For a vector stack variable, like: int4 stack[2], the data will be stored
-  // in the following ways depending on the StackWidth.
- // - // StackWidth = 1: - // - // T0.X = stack[0].x - // T1.X = stack[0].y - // T2.X = stack[0].z - // T3.X = stack[0].w - // T4.X = stack[1].x - // T5.X = stack[1].y - // T6.X = stack[1].z - // T7.X = stack[1].w - // - // StackWidth = 2: - // - // T0.X = stack[0].x - // T0.Y = stack[0].y - // T1.X = stack[0].z - // T1.Y = stack[0].w - // T2.X = stack[1].x - // T2.Y = stack[1].y - // T3.X = stack[1].z - // T3.Y = stack[1].w - // - // StackWidth = 4: - // T0.X = stack[0].x - // T0.Y = stack[0].y - // T0.Z = stack[0].z - // T0.W = stack[0].w - // T1.X = stack[1].x - // T1.Y = stack[1].y - // T1.Z = stack[1].z - // T1.W = stack[1].w - return 1; -} - -/// \returns The number of registers allocated for \p FI. -int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // Start the offset at 2 so we don't overwrite work group information. - // XXX: We should only do this when the shader actually uses this - // information. - unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); - int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; - - for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { - OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); - OffsetBytes += MFI->getObjectSize(i); - // Each register holds 4 bytes, so we must always align the offset to at - // least 4 bytes, so that 2 frame objects won't share the same register. - OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); - } - - if (FI != -1) - OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI)); - - return OffsetBytes / (getStackWidth(MF) * 4); -} - -const TargetFrameLowering::SpillSlot * -AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { - NumEntries = 0; - return nullptr; -} -void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const {} -void -AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { -} - -bool -AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { - return false; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/SIIntrinsics.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/SIIntrinsics.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/SIIntrinsics.td (nonexistent) @@ -1,199 +0,0 @@ -//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
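The getFrameIndexOffset implementation above turns a frame index into a register index: it walks the earlier stack objects, aligning each one and padding it to a 4-byte register slot, then divides the resulting byte offset by the stack width. A compact sketch of that computation (illustrative C++; FrameObject is a made-up stand-in for MachineFrameInfo's per-object data, and the loop assumes object indices start at 0 and FI is valid, unlike the original's negative-index and FI == -1 handling):

#include <cstdint>
#include <vector>

struct FrameObject { uint64_t Size; uint64_t Align; };

static uint64_t roundUp(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

static int frameIndexToRegister(const std::vector<FrameObject> &Objs,
                                unsigned FI, unsigned StackWidth) {
  // Start past the work-group information, as in the original.
  uint64_t OffsetBytes = 2 * (StackWidth * 4);
  for (unsigned i = 0; i < FI; ++i) {
    OffsetBytes = roundUp(OffsetBytes, Objs[i].Align);
    OffsetBytes += Objs[i].Size;
    // Two frame objects must never share the same 4-byte register.
    OffsetBytes = roundUp(OffsetBytes, 4);
  }
  OffsetBytes = roundUp(OffsetBytes, Objs[FI].Align);
  return OffsetBytes / (StackWidth * 4);
}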
-// -//===----------------------------------------------------------------------===// -// -// SI Intrinsic Definitions -// -//===----------------------------------------------------------------------===// - - -let TargetPrefix = "SI", isTarget = 1 in { - - def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; - def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; - - // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed - def int_SI_tbuffer_store : Intrinsic < - [], - [llvm_anyint_ty, // rsrc(SGPR) - llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32 - llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW - llvm_i32_ty, // vaddr(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // inst_offset(imm) - llvm_i32_ty, // dfmt(imm) - llvm_i32_ty, // nfmt(imm) - llvm_i32_ty, // offen(imm) - llvm_i32_ty, // idxen(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty], // tfe(imm) - []>; - - // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed - def int_SI_buffer_load_dword : Intrinsic < - [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32 - [llvm_anyint_ty, // rsrc(SGPR) - llvm_anyint_ty, // vaddr(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // inst_offset(imm) - llvm_i32_ty, // offen(imm) - llvm_i32_ty, // idxen(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty], // tfe(imm) - [IntrReadArgMem]>; - - def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - - // Fully-flexible SAMPLE instruction. - class SampleRaw : Intrinsic < - [llvm_v4f32_ty], // vdata(VGPR) - [llvm_anyint_ty, // vaddr(VGPR) - llvm_v8i32_ty, // rsrc(SGPR) - llvm_v4i32_ty, // sampler(SGPR) - llvm_i32_ty, // dmask(imm) - llvm_i32_ty, // unorm(imm) - llvm_i32_ty, // r128(imm) - llvm_i32_ty, // da(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty, // tfe(imm) - llvm_i32_ty], // lwe(imm) - [IntrNoMem]>; - - // Image instruction without a sampler. 
-  class Image : Intrinsic <
-    [llvm_v4f32_ty],   // vdata(VGPR)
-    [llvm_anyint_ty,   // vaddr(VGPR)
-     llvm_v8i32_ty,    // rsrc(SGPR)
-     llvm_i32_ty,      // dmask(imm)
-     llvm_i32_ty,      // unorm(imm)
-     llvm_i32_ty,      // r128(imm)
-     llvm_i32_ty,      // da(imm)
-     llvm_i32_ty,      // glc(imm)
-     llvm_i32_ty,      // slc(imm)
-     llvm_i32_ty,      // tfe(imm)
-     llvm_i32_ty],     // lwe(imm)
-    [IntrNoMem]>;
-
-  // Basic sample
-  def int_SI_image_sample : SampleRaw;
-  def int_SI_image_sample_cl : SampleRaw;
-  def int_SI_image_sample_d : SampleRaw;
-  def int_SI_image_sample_d_cl : SampleRaw;
-  def int_SI_image_sample_l : SampleRaw;
-  def int_SI_image_sample_b : SampleRaw;
-  def int_SI_image_sample_b_cl : SampleRaw;
-  def int_SI_image_sample_lz : SampleRaw;
-  def int_SI_image_sample_cd : SampleRaw;
-  def int_SI_image_sample_cd_cl : SampleRaw;
-
-  // Sample with comparison
-  def int_SI_image_sample_c : SampleRaw;
-  def int_SI_image_sample_c_cl : SampleRaw;
-  def int_SI_image_sample_c_d : SampleRaw;
-  def int_SI_image_sample_c_d_cl : SampleRaw;
-  def int_SI_image_sample_c_l : SampleRaw;
-  def int_SI_image_sample_c_b : SampleRaw;
-  def int_SI_image_sample_c_b_cl : SampleRaw;
-  def int_SI_image_sample_c_lz : SampleRaw;
-  def int_SI_image_sample_c_cd : SampleRaw;
-  def int_SI_image_sample_c_cd_cl : SampleRaw;
-
-  // Sample with offsets
-  def int_SI_image_sample_o : SampleRaw;
-  def int_SI_image_sample_cl_o : SampleRaw;
-  def int_SI_image_sample_d_o : SampleRaw;
-  def int_SI_image_sample_d_cl_o : SampleRaw;
-  def int_SI_image_sample_l_o : SampleRaw;
-  def int_SI_image_sample_b_o : SampleRaw;
-  def int_SI_image_sample_b_cl_o : SampleRaw;
-  def int_SI_image_sample_lz_o : SampleRaw;
-  def int_SI_image_sample_cd_o : SampleRaw;
-  def int_SI_image_sample_cd_cl_o : SampleRaw;
-
-  // Sample with comparison and offsets
-  def int_SI_image_sample_c_o : SampleRaw;
-  def int_SI_image_sample_c_cl_o : SampleRaw;
-  def int_SI_image_sample_c_d_o : SampleRaw;
-  def int_SI_image_sample_c_d_cl_o : SampleRaw;
-  def int_SI_image_sample_c_l_o : SampleRaw;
-  def int_SI_image_sample_c_b_o : SampleRaw;
-  def int_SI_image_sample_c_b_cl_o : SampleRaw;
-  def int_SI_image_sample_c_lz_o : SampleRaw;
-  def int_SI_image_sample_c_cd_o : SampleRaw;
-  def int_SI_image_sample_c_cd_cl_o : SampleRaw;
-
-  // Basic gather4
-  def int_SI_gather4 : SampleRaw;
-  def int_SI_gather4_cl : SampleRaw;
-  def int_SI_gather4_l : SampleRaw;
-  def int_SI_gather4_b : SampleRaw;
-  def int_SI_gather4_b_cl : SampleRaw;
-  def int_SI_gather4_lz : SampleRaw;
-
-  // Gather4 with comparison
-  def int_SI_gather4_c : SampleRaw;
-  def int_SI_gather4_c_cl : SampleRaw;
-  def int_SI_gather4_c_l : SampleRaw;
-  def int_SI_gather4_c_b : SampleRaw;
-  def int_SI_gather4_c_b_cl : SampleRaw;
-  def int_SI_gather4_c_lz : SampleRaw;
-
-  // Gather4 with offsets
-  def int_SI_gather4_o : SampleRaw;
-  def int_SI_gather4_cl_o : SampleRaw;
-  def int_SI_gather4_l_o : SampleRaw;
-  def int_SI_gather4_b_o : SampleRaw;
-  def int_SI_gather4_b_cl_o : SampleRaw;
-  def int_SI_gather4_lz_o : SampleRaw;
-
-  // Gather4 with comparison and offsets
-  def int_SI_gather4_c_o : SampleRaw;
-  def int_SI_gather4_c_cl_o : SampleRaw;
-  def int_SI_gather4_c_l_o : SampleRaw;
-  def int_SI_gather4_c_b_o : SampleRaw;
-  def int_SI_gather4_c_b_cl_o : SampleRaw;
-  def int_SI_gather4_c_lz_o : SampleRaw;
-
-  def int_SI_getlod : SampleRaw;
-
-  // Image intrinsics.
-  def int_SI_image_load : Image;
-  def int_SI_image_load_mip : Image;
-  def int_SI_getresinfo : Image;
-
-  // Deprecated image and sample intrinsics.
- class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - - def int_SI_sample : Sample; - def int_SI_sampleb : Sample; - def int_SI_sampled : Sample; - def int_SI_samplel : Sample; - def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - - /* Interpolation Intrinsics */ - - def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>; - - /* Control flow Intrinsics */ - - def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; - def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>; - def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; - def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>; - def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>; - def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>; - def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.h (nonexistent) @@ -1,35 +0,0 @@ -//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H -#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H - -namespace llvm { - -class AMDGPUSubtarget; -class MachineInstr; -class MCContext; -class MCInst; - -class AMDGPUMCInstLower { - MCContext &Ctx; - const AMDGPUSubtarget &ST; - -public: - AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); - - /// \brief Lower a MachineInstr to an MCInst - void lower(const MachineInstr *MI, MCInst &OutMI) const; - -}; - -} // End namespace llvm - -#endif Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPU.td =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPU.td (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPU.td (nonexistent) @@ -1,266 +0,0 @@ -//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-//
-//===----------------------------------------------------------------------===//
-
-include "llvm/Target/Target.td"
-
-//===----------------------------------------------------------------------===//
-// Subtarget Features
-//===----------------------------------------------------------------------===//
-
-// Debugging Features
-
-def FeatureDumpCode : SubtargetFeature <"DumpCode",
-  "DumpCode",
-  "true",
-  "Dump MachineInstrs in the CodeEmitter">;
-
-def FeatureDumpCodeLower : SubtargetFeature <"dumpcode",
-  "DumpCode",
-  "true",
-  "Dump MachineInstrs in the CodeEmitter">;
-
-def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer",
-  "EnableIRStructurizer",
-  "false",
-  "Disable IR Structurizer">;
-
-def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
-  "EnablePromoteAlloca",
-  "true",
-  "Enable promote alloca pass">;
-
-// Target features
-
-def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
-  "EnableIfCvt",
-  "false",
-  "Disable the if conversion pass">;
-
-def FeatureFP64 : SubtargetFeature<"fp64",
-  "FP64",
-  "true",
-  "Enable double precision operations">;
-
-def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
-  "FP64Denormals",
-  "true",
-  "Enable double precision denormal handling",
-  [FeatureFP64]>;
-
-def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
-  "FastFMAF32",
-  "true",
-  "Assuming f32 fma is at least as fast as mul + add",
-  []>;
-
-// Some instructions do not support denormals despite this flag. Using
-// fp32 denormals also causes instructions to run at the double
-// precision rate for the device.
-def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
-  "FP32Denormals",
-  "true",
-  "Enable single precision denormal handling">;
-
-def Feature64BitPtr : SubtargetFeature<"64BitPtr",
-  "Is64bit",
-  "true",
-  "Specify if 64-bit addressing should be used">;
-
-def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
-  "R600ALUInst",
-  "false",
-  "Older version of ALU instructions encoding">;
-
-def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
-  "HasVertexCache",
-  "true",
-  "Specify use of dedicated vertex cache">;
-
-def FeatureCaymanISA : SubtargetFeature<"caymanISA",
-  "CaymanISA",
-  "true",
-  "Use Cayman ISA">;
-
-def FeatureCFALUBug : SubtargetFeature<"cfalubug",
-  "CFALUBug",
-  "true",
-  "GPU has CF_ALU bug">;
-
-// XXX - This should probably be removed once enabled by default
-def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
-  "EnableLoadStoreOpt",
-  "true",
-  "Enable SI load/store optimizer pass">;
-
-def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
-  "FlatAddressSpace",
-  "true",
-  "Support flat address space">;
-
-def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
-  "EnableVGPRSpilling",
-  "true",
-  "Enable spilling of VGPRs to scratch memory">;
-
-def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
-  "SGPRInitBug",
-  "true",
-  "VI SGPR initialization bug requiring a fixed SGPR allocation size">;
-
-class SubtargetFeatureFetchLimit <string Value> :
-  SubtargetFeature <"fetch"#Value,
-    "TexVTXClauseSize",
-    Value,
-    "Limit the maximum number of fetches in a clause to "#Value>;
-
-def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
-def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
-
-class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
-  "wavefrontsize"#Value,
-  "WavefrontSize",
-  !cast<string>(Value),
-  "The number of threads per wavefront">;
-
-def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
-def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
-def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
-
-class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
-  "ldsbankcount"#Value,
-  "LDSBankCount",
-  !cast<string>(Value),
-  "The number of LDS banks per compute unit.">;
-
-def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
-def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>;
-
-class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
-  "localmemorysize"#Value,
-  "LocalMemorySize",
-  !cast<string>(Value),
-  "The size of local memory in bytes">;
-
-def FeatureGCN : SubtargetFeature<"gcn",
-  "IsGCN",
-  "true",
-  "GCN or newer GPU">;
-
-def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding",
-  "GCN1Encoding",
-  "true",
-  "Encoding format for SI and CI">;
-
-def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding",
-  "GCN3Encoding",
-  "true",
-  "Encoding format for VI">;
-
-def FeatureCIInsts : SubtargetFeature<"ci-insts",
-  "CIInsts",
-  "true",
-  "Additional instructions for CI+">;
-
-// Dummy feature used to disable assembler instructions.
-def FeatureDisable : SubtargetFeature<"",
-  "FeatureDisable","true",
-  "Dummy feature to disable assembler"
-  " instructions">;
-
-class SubtargetFeatureGeneration <string Value, list<SubtargetFeature> Implies> :
-  SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
-                    Value#" GPU generation", Implies>;
-
-def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
-def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
-def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
-
-def FeatureR600 : SubtargetFeatureGeneration<"R600",
-  [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>;
-
-def FeatureR700 : SubtargetFeatureGeneration<"R700",
-  [FeatureFetchLimit16, FeatureLocalMemorySize0]>;
-
-def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
-  [FeatureFetchLimit16, FeatureLocalMemorySize32768]>;
-
-def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
-  [FeatureFetchLimit16, FeatureWavefrontSize64,
-   FeatureLocalMemorySize32768]
->;
-
-def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
-  [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768,
-   FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
-   FeatureLDSBankCount32]>;
-
-def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
-  [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
-   FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
-   FeatureGCN1Encoding, FeatureCIInsts]>;
-
-def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
-  [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
-   FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
-   FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>;
-
-//===----------------------------------------------------------------------===//
-
-def AMDGPUInstrInfo : InstrInfo {
-  let guessInstructionProperties = 1;
-  let noNamedPositionallyEncodedOperands = 1;
-}
-
-def AMDGPUAsmParser : AsmParser {
-  // Some of the R600 registers have the same name, so this crashes.
-  // For example T0_XYZW and T0_XY both have the asm name T0.
- let ShouldEmitMatchRegisterName = 0; -} - -def AMDGPU : Target { - // Pull in Instruction Info: - let InstructionSet = AMDGPUInstrInfo; - let AssemblyParsers = [AMDGPUAsmParser]; -} - -// Dummy Instruction itineraries for pseudo instructions -def ALU_NULL : FuncUnit; -def NullALU : InstrItinClass; - -//===----------------------------------------------------------------------===// -// Predicate helper class -//===----------------------------------------------------------------------===// - -def TruePredicate : Predicate<"true">; -def isSICI : Predicate< - "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" - "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" ->, AssemblerPredicate<"FeatureGCN1Encoding">; - -class PredicateControl { - Predicate SubtargetPredicate; - Predicate SIAssemblerPredicate = isSICI; - list AssemblerPredicates = []; - Predicate AssemblerPredicate = TruePredicate; - list OtherPredicates = []; - list Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate], - AssemblerPredicates, - OtherPredicates); -} - -// Include AMDGPU TD files -include "R600Schedule.td" -include "SISchedule.td" -include "Processors.td" -include "AMDGPUInstrInfo.td" -include "AMDGPUIntrinsics.td" -include "AMDGPURegisterInfo.td" -include "AMDGPUInstructions.td" -include "AMDGPUCallingConv.td" Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600Defines.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600Defines.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600Defines.h (nonexistent) @@ -1,171 +0,0 @@ -//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H -#define LLVM_LIB_TARGET_R600_R600DEFINES_H - -#include "llvm/MC/MCRegisterInfo.h" - -// Operand Flags -#define MO_FLAG_CLAMP (1 << 0) -#define MO_FLAG_NEG (1 << 1) -#define MO_FLAG_ABS (1 << 2) -#define MO_FLAG_MASK (1 << 3) -#define MO_FLAG_PUSH (1 << 4) -#define MO_FLAG_NOT_LAST (1 << 5) -#define MO_FLAG_LAST (1 << 6) -#define NUM_MO_FLAGS 7 - -/// \brief Helper for getting the operand index for the instruction flags -/// operand. 
-#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3) - -namespace R600_InstFlag { - enum TIF { - TRANS_ONLY = (1 << 0), - TEX = (1 << 1), - REDUCTION = (1 << 2), - FC = (1 << 3), - TRIG = (1 << 4), - OP3 = (1 << 5), - VECTOR = (1 << 6), - //FlagOperand bits 7, 8 - NATIVE_OPERANDS = (1 << 9), - OP1 = (1 << 10), - OP2 = (1 << 11), - VTX_INST = (1 << 12), - TEX_INST = (1 << 13), - ALU_INST = (1 << 14), - LDS_1A = (1 << 15), - LDS_1A1D = (1 << 16), - IS_EXPORT = (1 << 17), - LDS_1A2D = (1 << 18) - }; -} - -#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) - -/// \brief Defines for extracting register information from register encoding -#define HW_REG_MASK 0x1ff -#define HW_CHAN_SHIFT 9 - -#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) -#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) - -#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST) -#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST) - -namespace OpName { - - enum VecOps { - UPDATE_EXEC_MASK_X, - UPDATE_PREDICATE_X, - WRITE_X, - OMOD_X, - DST_REL_X, - CLAMP_X, - SRC0_X, - SRC0_NEG_X, - SRC0_REL_X, - SRC0_ABS_X, - SRC0_SEL_X, - SRC1_X, - SRC1_NEG_X, - SRC1_REL_X, - SRC1_ABS_X, - SRC1_SEL_X, - PRED_SEL_X, - UPDATE_EXEC_MASK_Y, - UPDATE_PREDICATE_Y, - WRITE_Y, - OMOD_Y, - DST_REL_Y, - CLAMP_Y, - SRC0_Y, - SRC0_NEG_Y, - SRC0_REL_Y, - SRC0_ABS_Y, - SRC0_SEL_Y, - SRC1_Y, - SRC1_NEG_Y, - SRC1_REL_Y, - SRC1_ABS_Y, - SRC1_SEL_Y, - PRED_SEL_Y, - UPDATE_EXEC_MASK_Z, - UPDATE_PREDICATE_Z, - WRITE_Z, - OMOD_Z, - DST_REL_Z, - CLAMP_Z, - SRC0_Z, - SRC0_NEG_Z, - SRC0_REL_Z, - SRC0_ABS_Z, - SRC0_SEL_Z, - SRC1_Z, - SRC1_NEG_Z, - SRC1_REL_Z, - SRC1_ABS_Z, - SRC1_SEL_Z, - PRED_SEL_Z, - UPDATE_EXEC_MASK_W, - UPDATE_PREDICATE_W, - WRITE_W, - OMOD_W, - DST_REL_W, - CLAMP_W, - SRC0_W, - SRC0_NEG_W, - SRC0_REL_W, - SRC0_ABS_W, - SRC0_SEL_W, - SRC1_W, - SRC1_NEG_W, - SRC1_REL_W, - SRC1_ABS_W, - SRC1_SEL_W, - PRED_SEL_W, - IMM_0, - IMM_1, - VEC_COUNT - }; - -} - -//===----------------------------------------------------------------------===// -// Config register definitions -//===----------------------------------------------------------------------===// - -#define R_02880C_DB_SHADER_CONTROL 0x02880C -#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6) - -// These fields are the same for all shader types and families. 
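The S_*() macros above (and the S_NUM_GPRS/S_STACK_SIZE pair that follows) all use the same pack-a-bitfield idiom: mask the value to the field width, then shift it into position. A tiny sketch of that idiom as a function (illustrative C++, not part of the original headers):

#include <cassert>
#include <cstdint>

// Same shape as the S_*() macros: (value & mask) << shift.
static uint32_t packField(uint32_t Value, uint32_t Mask, unsigned Shift) {
  return (Value & Mask) << Shift;
}

int main() {
  // Equivalent of S_02880C_KILL_ENABLE(1): bit 6 of DB_SHADER_CONTROL.
  assert(packField(1, 0x1, 6) == 0x40);
  return 0;
}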
-#define S_NUM_GPRS(x) (((x) & 0xFF) << 0)
-#define S_STACK_SIZE(x) (((x) & 0xFF) << 8)
-//===----------------------------------------------------------------------===//
-// R600, R700 Registers
-//===----------------------------------------------------------------------===//
-
-#define R_028850_SQ_PGM_RESOURCES_PS 0x028850
-#define R_028868_SQ_PGM_RESOURCES_VS 0x028868
-
-//===----------------------------------------------------------------------===//
-// Evergreen, Northern Islands Registers
-//===----------------------------------------------------------------------===//
-
-#define R_028844_SQ_PGM_RESOURCES_PS 0x028844
-#define R_028860_SQ_PGM_RESOURCES_VS 0x028860
-#define R_028878_SQ_PGM_RESOURCES_GS 0x028878
-#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4
-
-#define R_0288E8_SQ_LDS_ALLOC 0x0288E8
-
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/CIInstructions.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/CIInstructions.td (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/CIInstructions.td (nonexistent)
@@ -1,42 +0,0 @@
-//===-- CIInstructions.td - CI Instruction Definitions --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// Instruction definitions for CI and newer.
-//===----------------------------------------------------------------------===//
-
-
-def isCIVI : Predicate <
-  "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
-  "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
->, AssemblerPredicate<"FeatureCIInsts">;
-
-//===----------------------------------------------------------------------===//
-// VOP1 Instructions
-//===----------------------------------------------------------------------===//
-
-let SubtargetPredicate = isCIVI in {
-
-defm V_TRUNC_F64 : VOP1Inst , "v_trunc_f64",
-  VOP_F64_F64, ftrunc
->;
-defm V_CEIL_F64 : VOP1Inst , "v_ceil_f64",
-  VOP_F64_F64, fceil
->;
-defm V_FLOOR_F64 : VOP1Inst , "v_floor_f64",
-  VOP_F64_F64, ffloor
->;
-defm V_RNDNE_F64 : VOP1Inst , "v_rndne_f64",
-  VOP_F64_F64, frint
->;
-defm V_LOG_LEGACY_F32 : VOP1Inst , "v_log_legacy_f32",
-  VOP_F32_F32
->;
-defm V_EXP_LEGACY_F32 : VOP1Inst , "v_exp_legacy_f32",
-  VOP_F32_F32
->;
-} // End SubtargetPredicate = isCIVI
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp (nonexistent)
@@ -1,91 +0,0 @@
-//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief R600 implementation of the TargetRegisterInfo class.
-// -//===----------------------------------------------------------------------===// - -#include "R600RegisterInfo.h" -#include "AMDGPUTargetMachine.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" - -using namespace llvm; - -R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { - RCW.RegWeight = 0; - RCW.WeightLimit = 0; -} - -BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { - BitVector Reserved(getNumRegs()); - - const R600InstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - - Reserved.set(AMDGPU::ZERO); - Reserved.set(AMDGPU::HALF); - Reserved.set(AMDGPU::ONE); - Reserved.set(AMDGPU::ONE_INT); - Reserved.set(AMDGPU::NEG_HALF); - Reserved.set(AMDGPU::NEG_ONE); - Reserved.set(AMDGPU::PV_X); - Reserved.set(AMDGPU::ALU_LITERAL_X); - Reserved.set(AMDGPU::ALU_CONST); - Reserved.set(AMDGPU::PREDICATE_BIT); - Reserved.set(AMDGPU::PRED_SEL_OFF); - Reserved.set(AMDGPU::PRED_SEL_ZERO); - Reserved.set(AMDGPU::PRED_SEL_ONE); - Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - - for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), - E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { - Reserved.set(*I); - } - - TII->reserveIndirectRegisters(Reserved, MF); - - return Reserved; -} - -unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { - return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; -} - -unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const { - return GET_REG_INDEX(getEncodingValue(Reg)); -} - -const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( - MVT VT) const { - switch(VT.SimpleTy) { - default: - case MVT::i32: return &AMDGPU::R600_TReg32RegClass; - } -} - -const RegClassWeight &R600RegisterInfo::getRegClassWeight( - const TargetRegisterClass *RC) const { - return RCW; -} - -bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { - assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - - switch (Reg) { - case AMDGPU::OQAP: - case AMDGPU::OQBP: - case AMDGPU::AR_X: - return false; - default: - return true; - } -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.h (nonexistent) @@ -1,78 +0,0 @@ -//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file a TargetTransformInfo::Concept conforming object specific to the -/// AMDGPU target machine. It uses the target's detailed information to -/// provide more precise answers to certain TTI queries, while letting the -/// target independent and default TTI implementations handle the rest. 
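The getHWRegChan() and getHWRegIndex() implementations above decode the packed hardware encoding using HW_CHAN_SHIFT and HW_REG_MASK from R600Defines.h: the low nine bits hold the register index and the bits above them hold the channel. A minimal sketch of that decoding (illustrative C++):

#include <cassert>

constexpr unsigned HWRegMask = 0x1ff; // HW_REG_MASK
constexpr unsigned HWChanShift = 9;   // HW_CHAN_SHIFT

static unsigned hwRegChan(unsigned Encoding) { return Encoding >> HWChanShift; }
static unsigned hwRegIndex(unsigned Encoding) { return Encoding & HWRegMask; }

int main() {
  unsigned Enc = (2u << HWChanShift) | 37u; // channel Z of register 37
  assert(hwRegChan(Enc) == 2 && hwRegIndex(Enc) == 37);
  return 0;
}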
-/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H - -#include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -class AMDGPUTTIImpl : public BasicTTIImplBase { - typedef BasicTTIImplBase BaseT; - typedef TargetTransformInfo TTI; - friend BaseT; - - const AMDGPUSubtarget *ST; - const AMDGPUTargetLowering *TLI; - - const AMDGPUSubtarget *getST() const { return ST; } - const AMDGPUTargetLowering *getTLI() const { return TLI; } - -public: - explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM) - : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} - - // Provide value semantics. MSVC requires that we spell all of these out. - AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) - : BaseT(static_cast(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} - AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg) - : BaseT(std::move(static_cast(Arg))), ST(std::move(Arg.ST)), - TLI(std::move(Arg.TLI)) {} - AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) { - BaseT::operator=(static_cast(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } - - bool hasBranchDivergence() { return true; } - - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); - - TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { - assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software; - } - - unsigned getNumberOfRegisters(bool Vector); - unsigned getRegisterBitWidth(bool Vector); - unsigned getMaxInterleaveFactor(unsigned VF); -}; - -} // end namespace llvm - -#endif Property changes on: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUTargetTransformInfo.h ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600InstrInfo.h =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600InstrInfo.h (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600InstrInfo.h (nonexistent) @@ -1,301 +0,0 @@ -//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition for R600InstrInfo -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H -#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H - -#include "AMDGPUInstrInfo.h" -#include "R600Defines.h" -#include "R600RegisterInfo.h" -#include - -namespace llvm { - - class AMDGPUTargetMachine; - class DFAPacketizer; - class ScheduleDAG; - class MachineFunction; - class MachineInstr; - class MachineInstrBuilder; - - class R600InstrInfo : public AMDGPUInstrInfo { - private: - const R600RegisterInfo RI; - - std::vector > - ExtractSrcs(MachineInstr *MI, const DenseMap &PV, unsigned &ConstCount) const; - - - MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const; - - MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const; - public: - enum BankSwizzle { - ALU_VEC_012_SCL_210 = 0, - ALU_VEC_021_SCL_122, - ALU_VEC_120_SCL_212, - ALU_VEC_102_SCL_221, - ALU_VEC_201, - ALU_VEC_210 - }; - - explicit R600InstrInfo(const AMDGPUSubtarget &st); - - const R600RegisterInfo &getRegisterInfo() const override; - void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const override; - bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const override; - - bool isTrig(const MachineInstr &MI) const; - bool isPlaceHolderOpcode(unsigned opcode) const; - bool isReductionOp(unsigned opcode) const; - bool isCubeOp(unsigned opcode) const; - - /// \returns true if this \p Opcode represents an ALU instruction. - bool isALUInstr(unsigned Opcode) const; - bool hasInstrModifiers(unsigned Opcode) const; - bool isLDSInstr(unsigned Opcode) const; - bool isLDSNoRetInstr(unsigned Opcode) const; - bool isLDSRetInstr(unsigned Opcode) const; - - /// \returns true if this \p Opcode represents an ALU instruction or an - /// instruction that will be lowered in ExpandSpecialInstrs Pass. - bool canBeConsideredALU(const MachineInstr *MI) const; - - bool isTransOnly(unsigned Opcode) const; - bool isTransOnly(const MachineInstr *MI) const; - bool isVectorOnly(unsigned Opcode) const; - bool isVectorOnly(const MachineInstr *MI) const; - bool isExport(unsigned Opcode) const; - - bool usesVertexCache(unsigned Opcode) const; - bool usesVertexCache(const MachineInstr *MI) const; - bool usesTextureCache(unsigned Opcode) const; - bool usesTextureCache(const MachineInstr *MI) const; - - bool mustBeLastInClause(unsigned Opcode) const; - bool usesAddressRegister(MachineInstr *MI) const; - bool definesAddressRegister(MachineInstr *MI) const; - bool readsLDSSrcReg(const MachineInstr *MI) const; - - /// \returns The operand index for the given source number. Legal values - /// for SrcNum are 0, 1, and 2. - int getSrcIdx(unsigned Opcode, unsigned SrcNum) const; - /// \returns The operand Index for the Sel operand given an index to one - /// of the instruction's src operands. - int getSelIdx(unsigned Opcode, unsigned SrcIdx) const; - - /// \returns a pair for each src of an ALU instructions. - /// The first member of a pair is the register id. - /// If register is ALU_CONST, second member is SEL. 
-  /// If register is ALU_LITERAL, second member is IMM.
-  /// Otherwise, second member value is undefined.
-  SmallVector<std::pair<MachineOperand *, int64_t>, 3>
-  getSrcs(MachineInstr *MI) const;
-
-  unsigned isLegalUpTo(
-    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
-    const std::vector<std::pair<int, unsigned> > &TransSrcs,
-    R600InstrInfo::BankSwizzle TransSwz) const;
-
-  bool FindSwizzleForVectorSlot(
-    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
-    const std::vector<std::pair<int, unsigned> > &TransSrcs,
-    R600InstrInfo::BankSwizzle TransSwz) const;
-
-  /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210
-  /// returns true and the first (in lexical order) BankSwizzle assignment,
-  /// starting from the one already provided in the Instruction Group MIs, that
-  /// fits Read Port limitations in BS if available. Otherwise returns false
-  /// and leaves undefined content in BS.
-  /// isLastAluTrans should be set if the last ALU of MIs will be executed on
-  /// the Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value
-  /// to apply to the last instruction.
-  /// PV holds GPR to PV registers in the Instruction Group MIs.
-  bool fitsReadPortLimitations(const std::vector<MachineInstr *> &MIs,
-                               const DenseMap<unsigned, unsigned> &PV,
-                               std::vector<BankSwizzle> &BS,
-                               bool isLastAluTrans) const;
-
-  /// An instruction group can only access two channel pairs (either [XY] or
-  /// [ZW]) from a KCache bank on R700+. This function checks whether the MIs
-  /// given as input meet this limitation.
-  bool fitsConstReadLimitations(const std::vector<MachineInstr *> &) const;
-  /// Same but using a const index set instead of an MI set.
-  bool fitsConstReadLimitations(const std::vector<unsigned> &) const;
-
-  /// \brief Vector instructions are instructions that must fill all
-  /// instruction slots within an instruction group.
-  bool isVector(const MachineInstr &MI) const;
-
-  bool isMov(unsigned Opcode) const override;
-
-  DFAPacketizer *
-  CreateTargetScheduleState(const TargetSubtargetInfo &) const override;
-
-  bool ReverseBranchCondition(
-    SmallVectorImpl<MachineOperand> &Cond) const override;
-
-  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                     MachineBasicBlock *&FBB,
-                     SmallVectorImpl<MachineOperand> &Cond,
-                     bool AllowModify) const override;
-
-  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                        MachineBasicBlock *FBB,
-                        const SmallVectorImpl<MachineOperand> &Cond,
-                        DebugLoc DL) const override;
-
-  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
-
-  bool isPredicated(const MachineInstr *MI) const override;
-
-  bool isPredicable(MachineInstr *MI) const override;
-
-  bool
-   isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
-                             const BranchProbability &Probability) const override;
-
-  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
-                           unsigned ExtraPredCycles,
-                           const BranchProbability &Probability) const override;
-
-  bool
-   isProfitableToIfCvt(MachineBasicBlock &TMBB,
-                       unsigned NumTCycles, unsigned ExtraTCycles,
-                       MachineBasicBlock &FMBB,
-                       unsigned NumFCycles, unsigned ExtraFCycles,
-                       const BranchProbability &Probability) const override;
-
-  bool DefinesPredicate(MachineInstr *MI,
-                        std::vector<MachineOperand> &Pred) const override;
-
-  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
-                         const SmallVectorImpl<MachineOperand> &Pred2) const override;
-
-  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
-                                 MachineBasicBlock &FMBB) const override;
-
-  bool PredicateInstruction(MachineInstr *MI,
-                      const SmallVectorImpl<MachineOperand> &Pred) const override;
-
-  unsigned int getPredicationCost(const MachineInstr *) const override;
-
-  unsigned int getInstrLatency(const InstrItineraryData *ItinData,
-                               const MachineInstr *MI,
-                               unsigned *PredCost = nullptr) const override;
-
-  int getInstrLatency(const InstrItineraryData *ItinData,
-                      SDNode *Node) const override { return 1; }
-
-  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
-
-  /// \brief Reserve the registers that may be accessed using indirect
-  /// addressing.
-  void reserveIndirectRegisters(BitVector &Reserved,
-                                const MachineFunction &MF) const;
-
-  unsigned calculateIndirectAddress(unsigned RegIndex,
-                                    unsigned Channel) const override;
-
-  const TargetRegisterClass *getIndirectAddrRegClass() const override;
-
-  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
-                                         MachineBasicBlock::iterator I,
-                                         unsigned ValueReg, unsigned Address,
-                                         unsigned OffsetReg) const override;
-
-  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
-                                        MachineBasicBlock::iterator I,
-                                        unsigned ValueReg, unsigned Address,
-                                        unsigned OffsetReg) const override;
-
-  unsigned getMaxAlusPerClause() const;
-
-  /// buildDefaultInstruction - This function returns a MachineInstr with
-  /// all the instruction modifiers initialized to their default values.
-  /// You can use this function to avoid manually specifying each instruction
-  /// modifier operand when building a new instruction.
-  ///
-  /// \returns a MachineInstr with all the instruction modifiers initialized
-  /// to their default values.
-  MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB,
-                                              MachineBasicBlock::iterator I,
-                                              unsigned Opcode,
-                                              unsigned DstReg,
-                                              unsigned Src0Reg,
-                                              unsigned Src1Reg = 0) const;
-
-  MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB,
-                                             MachineInstr *MI,
-                                             unsigned Slot,
-                                             unsigned DstReg) const;
-
-  MachineInstr *buildMovImm(MachineBasicBlock &BB,
-                            MachineBasicBlock::iterator I,
-                            unsigned DstReg,
-                            uint64_t Imm) const;
-
-  MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
-                              MachineBasicBlock::iterator I,
-                              unsigned DstReg, unsigned SrcReg) const override;
-
-  /// \brief Get the index of Op in the MachineInstr.
-  ///
-  /// \returns -1 if the Instruction does not contain the specified \p Op.
-  int getOperandIdx(const MachineInstr &MI, unsigned Op) const;
-
-  /// \brief Get the index of \p Op for the given Opcode.
-  ///
-  /// \returns -1 if the Instruction does not contain the specified \p Op.
-  int getOperandIdx(unsigned Opcode, unsigned Op) const;
-
-  /// \brief Helper function for setting instruction flag values.
-  void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const;
-
-  /// \returns true if this instruction has an operand for storing target
-  /// flags.
-  bool hasFlagOperand(const MachineInstr &MI) const;
-
-  /// \brief Add one of the MO_FLAG* flags to the specified \p Operand.
-  void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
-
-  /// \brief Determine if the specified \p Flag is set on this \p Operand.
-  bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
-
-  /// \param SrcIdx The register source to set the flag on (e.g. src0, src1,
-  /// src2)
-  /// \param Flag The flag being set.
-  ///
-  /// \returns the operand containing the flags for this instruction.
-  MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0,
-                            unsigned Flag = 0) const;
-
-  /// \brief Clear the specified flag on the instruction.
-  void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
-};
-
-namespace AMDGPU {
-
-int getLDSNoRetOp(uint16_t Opcode);
-
-} // End namespace AMDGPU
-
-} // End llvm namespace
-
-#endif
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/VIInstrFormats.td
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/VIInstrFormats.td	(revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/VIInstrFormats.td	(nonexistent)
@@ -1,166 +0,0 @@
-//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// VI Instruction format definitions.
-//
-//===----------------------------------------------------------------------===//
-
-class DSe_vi <bits<8> op> : Enc64 {
-  bits<8> vdst;
-  bits<1> gds;
-  bits<8> addr;
-  bits<8> data0;
-  bits<8> data1;
-  bits<8> offset0;
-  bits<8> offset1;
-
-  let Inst{7-0} = offset0;
-  let Inst{15-8} = offset1;
-  let Inst{16} = gds;
-  let Inst{24-17} = op;
-  let Inst{31-26} = 0x36; // encoding
-  let Inst{39-32} = addr;
-  let Inst{47-40} = data0;
-  let Inst{55-48} = data1;
-  let Inst{63-56} = vdst;
-}
-
-class MUBUFe_vi <bits<7> op> : Enc64 {
-  bits<12> offset;
-  bits<1> offen;
-  bits<1> idxen;
-  bits<1> glc;
-  bits<1> lds;
-  bits<8> vaddr;
-  bits<8> vdata;
-  bits<7> srsrc;
-  bits<1> slc;
-  bits<1> tfe;
-  bits<8> soffset;
-
-  let Inst{11-0} = offset;
-  let Inst{12} = offen;
-  let Inst{13} = idxen;
-  let Inst{14} = glc;
-  let Inst{16} = lds;
-  let Inst{17} = slc;
-  let Inst{24-18} = op;
-  let Inst{31-26} = 0x38; // encoding
-  let Inst{39-32} = vaddr;
-  let Inst{47-40} = vdata;
-  let Inst{52-48} = srsrc{6-2};
-  let Inst{55} = tfe;
-  let Inst{63-56} = soffset;
-}
-
-class MTBUFe_vi <bits<4> op> : Enc64 {
-  bits<12> offset;
-  bits<1> offen;
-  bits<1> idxen;
-  bits<1> glc;
-  bits<4> dfmt;
-  bits<3> nfmt;
-  bits<8> vaddr;
-  bits<8> vdata;
-  bits<7> srsrc;
-  bits<1> slc;
-  bits<1> tfe;
-  bits<8> soffset;
-
-  let Inst{11-0} = offset;
-  let Inst{12} = offen;
-  let Inst{13} = idxen;
-  let Inst{14} = glc;
-  let Inst{18-15} = op;
-  let Inst{22-19} = dfmt;
-  let Inst{25-23} = nfmt;
-  let Inst{31-26} = 0x3a; // encoding
-  let Inst{39-32} = vaddr;
-  let Inst{47-40} = vdata;
-  let Inst{52-48} = srsrc{6-2};
-  let Inst{54} = slc;
-  let Inst{55} = tfe;
-  let Inst{63-56} = soffset;
-}
-
-class SMEMe_vi <bits<8> op, bit imm> : Enc64 {
-  bits<7> sbase;
-  bits<7> sdata;
-  bits<1> glc;
-  bits<20> offset;
-
-  let Inst{5-0} = sbase{6-1};
-  let Inst{12-6} = sdata;
-  let Inst{16} = glc;
-  let Inst{17} = imm;
-  let Inst{25-18} = op;
-  let Inst{31-26} = 0x30; // encoding
-  let Inst{51-32} = offset;
-}
-
-class VOP3e_vi <bits<10> op> : Enc64 {
-  bits<8> vdst;
-  bits<2> src0_modifiers;
-  bits<9> src0;
-  bits<2> src1_modifiers;
-  bits<9> src1;
-  bits<2> src2_modifiers;
-  bits<9> src2;
-  bits<1> clamp;
-  bits<2> omod;
-
-  let Inst{7-0} = vdst;
-  let Inst{8} = src0_modifiers{1};
-  let Inst{9} = src1_modifiers{1};
-  let Inst{10} = src2_modifiers{1};
-  let Inst{15} = clamp;
-  let Inst{25-16} = op;
-  let Inst{31-26} = 0x34; // encoding
-  let Inst{40-32} = src0;
-  let Inst{49-41} = src1;
-  let Inst{58-50} = src2;
-  let Inst{60-59} = omod;
-  let Inst{61} = src0_modifiers{0};
-  let Inst{62} = src1_modifiers{0};
-  let Inst{63} = src2_modifiers{0};
-}
-
-class VOP3be_vi <bits<10> op> : Enc64 {
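-  // Relative to VOP3e_vi above, this "b" encoding trades the three
-  // per-source modifier bits in Inst{10-8} for a 7-bit scalar
-  // destination (sdst) in Inst{14-8}, used e.g. for the carry-out of
-  // the add/sub family; clamp, omod and the neg bits are unchanged.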
bits<8> vdst; - bits<2> src0_modifiers; - bits<9> src0; - bits<2> src1_modifiers; - bits<9> src1; - bits<2> src2_modifiers; - bits<9> src2; - bits<7> sdst; - bits<2> omod; - bits<1> clamp; - - let Inst{7-0} = vdst; - let Inst{14-8} = sdst; - let Inst{15} = clamp; - let Inst{25-16} = op; - let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = src0; - let Inst{49-41} = src1; - let Inst{58-50} = src2; - let Inst{60-59} = omod; - let Inst{61} = src0_modifiers{0}; - let Inst{62} = src1_modifiers{0}; - let Inst{63} = src2_modifiers{0}; -} - -class EXPe_vi : EXPe { - let Inst{31-26} = 0x31; //encoding -} - -class VINTRPe_vi op> : VINTRPe { - let Inst{31-26} = 0x35; // encoding -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp (nonexistent) @@ -1,2286 +0,0 @@ -//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Custom DAG lowering for R600 -// -//===----------------------------------------------------------------------===// - -#include "R600ISelLowering.h" -#include "AMDGPUFrameLowering.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/Argument.h" -#include "llvm/IR/Function.h" - -using namespace llvm; - -R600TargetLowering::R600TargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) - : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { - addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); - addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); - addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); - - computeRegisterProperties(STI.getRegisterInfo()); - - // Set condition code actions - setCondCodeAction(ISD::SETO, MVT::f32, Expand); - setCondCodeAction(ISD::SETUO, MVT::f32, Expand); - setCondCodeAction(ISD::SETLT, MVT::f32, Expand); - setCondCodeAction(ISD::SETLE, MVT::f32, Expand); - setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); - setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); - setCondCodeAction(ISD::SETONE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); - setCondCodeAction(ISD::SETULT, MVT::f32, Expand); - setCondCodeAction(ISD::SETULE, MVT::f32, Expand); - - setCondCodeAction(ISD::SETLE, MVT::i32, Expand); - setCondCodeAction(ISD::SETLT, MVT::i32, Expand); - setCondCodeAction(ISD::SETULE, MVT::i32, Expand); - setCondCodeAction(ISD::SETULT, MVT::i32, Expand); - - setOperationAction(ISD::FCOS, 
MVT::f32, Custom); - setOperationAction(ISD::FSIN, MVT::f32, Custom); - - setOperationAction(ISD::SETCC, MVT::v4i32, Expand); - setOperationAction(ISD::SETCC, MVT::v2i32, Expand); - - setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::BR_CC, MVT::f32, Expand); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); - - setOperationAction(ISD::FSUB, MVT::f32, Expand); - - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); - - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); - - setOperationAction(ISD::SETCC, MVT::i32, Expand); - setOperationAction(ISD::SETCC, MVT::f32, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); - - setOperationAction(ISD::SELECT, MVT::i32, Expand); - setOperationAction(ISD::SELECT, MVT::f32, Expand); - setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v4i32, Expand); - - // ADD, SUB overflow. - // TODO: turn these into Legal? - if (Subtarget->hasCARRY()) - setOperationAction(ISD::UADDO, MVT::i32, Custom); - - if (Subtarget->hasBORROW()) - setOperationAction(ISD::USUBO, MVT::i32, Custom); - - // Expand sign extension of vectors - if (!Subtarget->hasBFE()) - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); - - if (!Subtarget->hasBFE()) - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); - - if (!Subtarget->hasBFE()) - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); - - - // Legalize loads and stores to the private address space. - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - - // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address - // spaces, so it is custom lowered to handle those where it isn't. 
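-  // For example, a (zextload i8) from global memory can be selected
-  // directly to a fetch instruction, while the same load from private
-  // memory has to be emulated in LowerLOAD below. (Illustrative
-  // split; the decision is made per address space at lowering time.)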
- for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); - } - - setOperationAction(ISD::STORE, MVT::i8, Custom); - setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - setTruncStoreAction(MVT::i32, MVT::i8, Custom); - setTruncStoreAction(MVT::i32, MVT::i16, Custom); - - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - setOperationAction(ISD::FrameIndex, MVT::i32, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); - - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - - setTargetDAGCombine(ISD::FP_ROUND); - setTargetDAGCombine(ISD::FP_TO_SINT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - - // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 - // to be Legal/Custom in order to avoid library calls. - setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); - setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); - setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - - const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; - for (MVT VT : ScalarIntVTs) { - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::SUBE, VT, Expand); - } - - setSchedulingPreference(Sched::Source); -} - -MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { - MachineFunction * MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - MachineBasicBlock::iterator I = *MI; - const R600InstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - switch (MI->getOpcode()) { - default: - // Replace LDS_*_RET instruction that don't have any uses with the - // equivalent LDS_*_NORET instruction. - if (TII->isLDSRetInstr(MI->getOpcode())) { - int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); - assert(DstIdx != -1); - MachineInstrBuilder NewMI; - // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add - // LDS_1A2D support and remove this special case. 
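-    // E.g. an LDS_ADD_RET whose dst register is never read can be
-    // emitted as LDS_ADD, dropping the unused return value. (Opcode
-    // pair shown for illustration; the mapping is supplied by
-    // AMDGPU::getLDSNoRetOp.)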
- if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || - MI->getOpcode() == AMDGPU::LDS_CMPST_RET) - return BB; - - NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), - TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); - for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { - NewMI.addOperand(MI->getOperand(i)); - } - } else { - return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); - } - break; - case AMDGPU::CLAMP_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); - break; - } - - case AMDGPU::FABS_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_ABS); - break; - } - - case AMDGPU::FNEG_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_NEG); - break; - } - - case AMDGPU::MASK_WRITE: { - unsigned maskedRegister = MI->getOperand(0).getReg(); - assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); - MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); - TII->addFlag(defInstr, 0, MO_FLAG_MASK); - break; - } - - case AMDGPU::MOV_IMM_F32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getFPImm()->getValueAPF() - .bitcastToAPInt().getZExtValue()); - break; - case AMDGPU::MOV_IMM_I32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getImm()); - break; - case AMDGPU::CONST_COPY: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, - MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); - TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, - MI->getOperand(1).getImm()); - break; - } - - case AMDGPU::RAT_WRITE_CACHELESS_32_eg: - case AMDGPU::RAT_WRITE_CACHELESS_64_eg: - case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 
1 : 0; - - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(EOP); // Set End of program bit - break; - } - - case AMDGPU::TXD: { - unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); - unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; - unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; - - switch (TextureId) { - case 5: // Rect - CTX = CTY = 0; - break; - case 6: // Shadow1D - SrcW = SrcZ; - break; - case 7: // Shadow2D - SrcW = SrcZ; - break; - case 8: // ShadowRect - CTX = CTY = 0; - SrcW = SrcZ; - break; - case 9: // 1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 10: // 2DArray - CTZ = 0; - break; - case 11: // Shadow1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 12: // Shadow2DArray - CTZ = 0; - break; - } - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) - .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); - break; - } - - case AMDGPU::TXD_SHADOW: { - unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); - unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; - unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; - - switch (TextureId) { - case 5: // Rect - CTX = CTY = 0; - break; - case 6: // Shadow1D - SrcW = SrcZ; - break; - case 7: // Shadow2D - SrcW = SrcZ; - break; - case 8: // ShadowRect - CTX = CTY = 0; - SrcW = SrcZ; - break; - case 9: // 1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 10: // 2DArray - CTZ = 0; - break; - case 11: // Shadow1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 12: // Shadow2DArray - CTZ = 0; - break; - } - - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 
- .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); - break; - } - - case AMDGPU::BRANCH: - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) - .addOperand(MI->getOperand(0)); - break; - - case AMDGPU::BRANCH_COND_f32: { - MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) - .addImm(OPCODE_IS_NOT_ZERO) - .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - break; - } - - case AMDGPU::BRANCH_COND_i32: { - MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) - .addImm(OPCODE_IS_NOT_ZERO_INT) - .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - break; - } - - case AMDGPU::EG_ExportSwz: - case AMDGPU::R600_ExportSwz: { - // Instruction is left unmodified if its not the last one of its type - bool isLastInstructionOfItsType = true; - unsigned InstExportType = MI->getOperand(1).getImm(); - for (MachineBasicBlock::iterator NextExportInst = std::next(I), - EndBlock = BB->end(); NextExportInst != EndBlock; - NextExportInst = std::next(NextExportInst)) { - if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || - NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { - unsigned CurrentInstExportType = NextExportInst->getOperand(1) - .getImm(); - if (CurrentInstExportType == InstExportType) { - isLastInstructionOfItsType = false; - break; - } - } - } - bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; - if (!EOP && !isLastInstructionOfItsType) - return BB; - unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(2)) - .addOperand(MI->getOperand(3)) - .addOperand(MI->getOperand(4)) - .addOperand(MI->getOperand(5)) - .addOperand(MI->getOperand(6)) - .addImm(CfInst) - .addImm(EOP); - break; - } - case AMDGPU::RETURN: { - // RETURN instructions must have the live-out registers as implicit uses, - // otherwise they appear dead. 
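-    // E.g. a shader whose outputs live in T0_X..T0_W returns as
-    //   RETURN ... implicit T0_X, implicit T0_Y, implicit T0_Z, implicit T0_W
-    // so the instructions defining those registers are not deleted as
-    // dead code. (Register choice is illustrative.)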
- R600MachineFunctionInfo *MFI = MF->getInfo(); - MachineInstrBuilder MIB(*MF, MI); - for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) - MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); - return BB; - } - } - - MI->eraseFromParent(); - return BB; -} - -//===----------------------------------------------------------------------===// -// Custom DAG Lowering Operations -//===----------------------------------------------------------------------===// - -SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo(); - switch (Op.getOpcode()) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); - case ISD::SRA_PARTS: - case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); - case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); - case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); - case ISD::FCOS: - case ISD::FSIN: return LowerTrig(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::LOAD: { - SDValue Result = LowerLOAD(Op, DAG); - assert((!Result.getNode() || - Result.getNode()->getNumValues() == 2) && - "Load should return a value and a chain"); - return Result; - } - - case ISD::BRCOND: return LowerBRCOND(Op, DAG); - case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); - case ISD::INTRINSIC_VOID: { - SDValue Chain = Op.getOperand(0); - unsigned IntrinsicID = - cast(Op.getOperand(1))->getZExtValue(); - switch (IntrinsicID) { - case AMDGPUIntrinsic::AMDGPU_store_output: { - int64_t RegIndex = cast(Op.getOperand(3))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MFI->LiveOuts.push_back(Reg); - return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); - } - case AMDGPUIntrinsic::R600_store_swizzle: { - SDLoc DL(Op); - const SDValue Args[8] = { - Chain, - Op.getOperand(2), // Export Value - Op.getOperand(3), // ArrayBase - Op.getOperand(4), // Type - DAG.getConstant(0, DL, MVT::i32), // SWZ_X - DAG.getConstant(1, DL, MVT::i32), // SWZ_Y - DAG.getConstant(2, DL, MVT::i32), // SWZ_Z - DAG.getConstant(3, DL, MVT::i32) // SWZ_W - }; - return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); - } - - // default for switch(IntrinsicID) - default: break; - } - // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) - break; - } - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrinsicID = - cast(Op.getOperand(0))->getZExtValue(); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - switch(IntrinsicID) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case AMDGPUIntrinsic::R600_load_input: { - int64_t RegIndex = cast(Op.getOperand(1))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(Reg); - return DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), Reg, VT); - } - - case AMDGPUIntrinsic::R600_interp_input: { - int slot = cast(Op.getOperand(1))->getZExtValue(); - int ijb = cast(Op.getOperand(2))->getSExtValue(); - MachineSDNode *interp; - if (ijb < 0) { - const R600InstrInfo *TII = - 
static_cast(Subtarget->getInstrInfo()); - interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, - MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); - return DAG.getTargetExtractSubreg( - TII->getRegisterInfo().getSubRegFromChannel(slot % 4), - DL, MVT::f32, SDValue(interp, 0)); - } - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); - unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); - MRI.addLiveIn(RegisterI); - MRI.addLiveIn(RegisterJ); - SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); - SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); - - if (slot % 4 < 2) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - return SDValue(interp, slot % 2); - } - case AMDGPUIntrinsic::R600_interp_xy: - case AMDGPUIntrinsic::R600_interp_zw: { - int slot = cast(Op.getOperand(1))->getZExtValue(); - MachineSDNode *interp; - SDValue RegisterINode = Op.getOperand(2); - SDValue RegisterJNode = Op.getOperand(3); - - if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, - SDValue(interp, 0), SDValue(interp, 1)); - } - case AMDGPUIntrinsic::R600_tex: - case AMDGPUIntrinsic::R600_texc: - case AMDGPUIntrinsic::R600_txl: - case AMDGPUIntrinsic::R600_txlc: - case AMDGPUIntrinsic::R600_txb: - case AMDGPUIntrinsic::R600_txbc: - case AMDGPUIntrinsic::R600_txf: - case AMDGPUIntrinsic::R600_txq: - case AMDGPUIntrinsic::R600_ddx: - case AMDGPUIntrinsic::R600_ddy: - case AMDGPUIntrinsic::R600_ldptr: { - unsigned TextureOp; - switch (IntrinsicID) { - case AMDGPUIntrinsic::R600_tex: - TextureOp = 0; - break; - case AMDGPUIntrinsic::R600_texc: - TextureOp = 1; - break; - case AMDGPUIntrinsic::R600_txl: - TextureOp = 2; - break; - case AMDGPUIntrinsic::R600_txlc: - TextureOp = 3; - break; - case AMDGPUIntrinsic::R600_txb: - TextureOp = 4; - break; - case AMDGPUIntrinsic::R600_txbc: - TextureOp = 5; - break; - case AMDGPUIntrinsic::R600_txf: - TextureOp = 6; - break; - case AMDGPUIntrinsic::R600_txq: - TextureOp = 7; - break; - case AMDGPUIntrinsic::R600_ddx: - TextureOp = 8; - break; - case AMDGPUIntrinsic::R600_ddy: - TextureOp = 9; - break; - case AMDGPUIntrinsic::R600_ldptr: - TextureOp = 10; - break; - default: - llvm_unreachable("Unknow Texture Operation"); - } - - SDValue TexArgs[19] = { - DAG.getConstant(TextureOp, DL, MVT::i32), - Op.getOperand(1), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(1, DL, MVT::i32), - DAG.getConstant(2, DL, MVT::i32), - DAG.getConstant(3, DL, MVT::i32), - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(1, DL, MVT::i32), - DAG.getConstant(2, DL, MVT::i32), - DAG.getConstant(3, DL, MVT::i32), - 
Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10) - }; - return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); - } - case AMDGPUIntrinsic::AMDGPU_dp4: { - SDValue Args[8] = { - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(0, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(0, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(1, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(1, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(2, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(2, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(3, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(3, DL, MVT::i32)) - }; - return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); - } - - case Intrinsic::r600_read_ngroups_x: - return LowerImplicitParameter(DAG, VT, DL, 0); - case Intrinsic::r600_read_ngroups_y: - return LowerImplicitParameter(DAG, VT, DL, 1); - case Intrinsic::r600_read_ngroups_z: - return LowerImplicitParameter(DAG, VT, DL, 2); - case Intrinsic::r600_read_global_size_x: - return LowerImplicitParameter(DAG, VT, DL, 3); - case Intrinsic::r600_read_global_size_y: - return LowerImplicitParameter(DAG, VT, DL, 4); - case Intrinsic::r600_read_global_size_z: - return LowerImplicitParameter(DAG, VT, DL, 5); - case Intrinsic::r600_read_local_size_x: - return LowerImplicitParameter(DAG, VT, DL, 6); - case Intrinsic::r600_read_local_size_y: - return LowerImplicitParameter(DAG, VT, DL, 7); - case Intrinsic::r600_read_local_size_z: - return LowerImplicitParameter(DAG, VT, DL, 8); - - case Intrinsic::AMDGPU_read_workdim: - return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4); - - case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_X, VT); - case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Y, VT); - case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Z, VT); - case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_X, VT); - case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Y, VT); - case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Z, VT); - case Intrinsic::AMDGPU_rsq: - // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
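-    // FRACT yields the fractional part computed as x - floor(x), so
-    // e.g. fract(2.75) == 0.75 and fract(-0.25) == 0.75 (worked
-    // values for illustration; note the floor-based wrap for
-    // negative inputs).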
- return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); - } - // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) - break; - } - } // end switch(Op.getOpcode()) - return SDValue(); -} - -void R600TargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const { - switch (N->getOpcode()) { - default: - AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); - return; - case ISD::FP_TO_UINT: - if (N->getValueType(0) == MVT::i1) { - Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); - return; - } - // Fall-through. Since we don't care about out of bounds values - // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint - // considers some extra cases which are not necessary here. - case ISD::FP_TO_SINT: { - SDValue Result; - if (expandFP_TO_SINT(N, Result, DAG)) - Results.push_back(Result); - return; - } - case ISD::SDIVREM: { - SDValue Op = SDValue(N, 1); - SDValue RES = LowerSDIVREM(Op, DAG); - Results.push_back(RES); - Results.push_back(RES.getValue(1)); - break; - } - case ISD::UDIVREM: { - SDValue Op = SDValue(N, 0); - LowerUDIVREM64(Op, DAG, Results); - break; - } - } -} - -SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, - SDValue Vector) const { - - SDLoc DL(Vector); - EVT VecVT = Vector.getValueType(); - EVT EltVT = VecVT.getVectorElementType(); - SmallVector Args; - - for (unsigned i = 0, e = VecVT.getVectorNumElements(); - i != e; ++i) { - Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, - DAG.getConstant(i, DL, getVectorIdxTy()))); - } - - return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); -} - -SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - - SDLoc DL(Op); - SDValue Vector = Op.getOperand(0); - SDValue Index = Op.getOperand(1); - - if (isa(Index) || - Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) - return Op; - - Vector = vectorToVerticalVector(DAG, Vector); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), - Vector, Index); -} - -SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - SDValue Vector = Op.getOperand(0); - SDValue Value = Op.getOperand(1); - SDValue Index = Op.getOperand(2); - - if (isa(Index) || - Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) - return Op; - - Vector = vectorToVerticalVector(DAG, Vector); - SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), - Vector, Value, Index); - return vectorToVerticalVector(DAG, Insert); -} - -SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { - // On hw >= R700, COS/SIN input must be between -1. and 1. 
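-  // (i.e. the argument is effectively expressed in turns, with 1.0
-  // standing for a full 2*Pi revolution).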
- // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) - EVT VT = Op.getValueType(); - SDValue Arg = Op.getOperand(0); - SDLoc DL(Op); - SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, - DAG.getNode(ISD::FADD, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Arg, - DAG.getConstantFP(0.15915494309, DL, MVT::f32)), - DAG.getConstantFP(0.5, DL, MVT::f32))); - unsigned TrigNode; - switch (Op.getOpcode()) { - case ISD::FCOS: - TrigNode = AMDGPUISD::COS_HW; - break; - case ISD::FSIN: - TrigNode = AMDGPUISD::SIN_HW; - break; - default: - llvm_unreachable("Wrong trig opcode"); - } - SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, - DAG.getNode(ISD::FADD, DL, VT, FractPart, - DAG.getConstantFP(-0.5, DL, MVT::f32))); - if (Gen >= AMDGPUSubtarget::R700) - return TrigVal; - // On R600 hw, COS/SIN input must be between -Pi and Pi. - return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, - DAG.getConstantFP(3.14159265359, DL, MVT::f32)); -} - -SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - SDValue Shift = Op.getOperand(2); - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue One = DAG.getConstant(1, DL, VT); - - SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); - SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); - SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); - SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); - - // The dance around Width1 is necessary for 0 special case. - // Without it the CompShift might be 32, producing incorrect results in - // Overflow. So we do the shift in two steps, the alternative is to - // add a conditional to filter the special case. - - SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); - Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); - - SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); - HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); - SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); - - SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); - SDValue LoBig = Zero; - - Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); - Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); - - return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); -} - -SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - SDValue Shift = Op.getOperand(2); - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue One = DAG.getConstant(1, DL, VT); - - const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; - - SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); - SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); - SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); - SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); - - // The dance around Width1 is necessary for 0 special case. - // Without it the CompShift might be 32, producing incorrect results in - // Overflow. So we do the shift in two steps, the alternative is to - // add a conditional to filter the special case. - - SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); - Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); - - SDValue HiSmall = DAG.getNode(SRA ? 
ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); - SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); - LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); - - SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); - SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; - - Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); - Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); - - return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); -} - -SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, - unsigned mainop, unsigned ovf) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - - SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); - // Extend sign. - OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, - DAG.getValueType(MVT::i1)); - - SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); - - return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); -} - -SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - return DAG.getNode( - ISD::SETCC, - DL, - MVT::i1, - Op, DAG.getConstantFP(0.0f, DL, MVT::f32), - DAG.getCondCode(ISD::SETNE) - ); -} - -SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, - SDLoc DL, - unsigned DwordOffset) const { - unsigned ByteOffset = DwordOffset * 4; - PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_BUFFER_0); - - // We shouldn't be using an offset wider than 16-bits for implicit parameters. - assert(isInt<16>(ByteOffset)); - - return DAG.getLoad(VT, DL, DAG.getEntryNode(), - DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR - MachinePointerInfo(ConstantPointerNull::get(PtrType)), - false, false, false, 0); -} - -bool R600TargetLowering::isZero(SDValue Op) const { - if(ConstantSDNode *Cst = dyn_cast(Op)) { - return Cst->isNullValue(); - } else if(ConstantFPSDNode *CstFP = dyn_cast(Op)){ - return CstFP->isZero(); - } else { - return false; - } -} - -SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue True = Op.getOperand(2); - SDValue False = Op.getOperand(3); - SDValue CC = Op.getOperand(4); - SDValue Temp; - - if (VT == MVT::f32) { - DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); - SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - if (MinMax) - return MinMax; - } - - // LHS and RHS are guaranteed to be the same value type - EVT CompareVT = LHS.getValueType(); - - // Check if we can lower this to a native operation. - - // Try to lower to a SET* instruction: - // - // SET* can match the following patterns: - // - // select_cc f32, f32, -1, 0, cc_supported - // select_cc f32, f32, 1.0f, 0.0f, cc_supported - // select_cc i32, i32, -1, 0, cc_supported - // - - // Move hardware True/False values to the correct operand. 
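-  // E.g. (select_cc a, b, 0, -1, seteq) is rewritten here to
-  // (select_cc a, b, -1, 0, setne) so that a SETNE_INT can match it.
-  // (Rewrite shown for illustration.)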
- ISD::CondCode CCOpcode = cast(CC)->get(); - ISD::CondCode InverseCC = - ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); - if (isHWTrueValue(False) && isHWFalseValue(True)) { - if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) { - std::swap(False, True); - CC = DAG.getCondCode(InverseCC); - } else { - ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC); - if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) { - std::swap(False, True); - std::swap(LHS, RHS); - CC = DAG.getCondCode(SwapInvCC); - } - } - } - - if (isHWTrueValue(True) && isHWFalseValue(False) && - (CompareVT == VT || VT == MVT::i32)) { - // This can be matched by a SET* instruction. - return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); - } - - // Try to lower to a CND* instruction: - // - // CND* can match the following patterns: - // - // select_cc f32, 0.0, f32, f32, cc_supported - // select_cc f32, 0.0, i32, i32, cc_supported - // select_cc i32, 0, f32, f32, cc_supported - // select_cc i32, 0, i32, i32, cc_supported - // - - // Try to move the zero value to the RHS - if (isZero(LHS)) { - ISD::CondCode CCOpcode = cast(CC)->get(); - // Try swapping the operands - ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode); - if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { - std::swap(LHS, RHS); - CC = DAG.getCondCode(CCSwapped); - } else { - // Try inverting the conditon and then swapping the operands - ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger()); - CCSwapped = ISD::getSetCCSwappedOperands(CCInv); - if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { - std::swap(True, False); - std::swap(LHS, RHS); - CC = DAG.getCondCode(CCSwapped); - } - } - } - if (isZero(RHS)) { - SDValue Cond = LHS; - SDValue Zero = RHS; - ISD::CondCode CCOpcode = cast(CC)->get(); - if (CompareVT != VT) { - // Bitcast True / False to the correct types. This will end up being - // a nop, but it allows us to define only a single pattern in the - // .TD files for each CND* instruction rather than having to have - // one pattern for integer True/False and one for fp True/False - True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); - False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); - } - - switch (CCOpcode) { - case ISD::SETONE: - case ISD::SETUNE: - case ISD::SETNE: - CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); - Temp = True; - True = False; - False = Temp; - break; - default: - break; - } - SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, - Cond, Zero, - True, False, - DAG.getCondCode(CCOpcode)); - return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); - } - - // If we make it this for it means we have no native instructions to handle - // this SELECT_CC, so we must lower it. - SDValue HWTrue, HWFalse; - - if (CompareVT == MVT::f32) { - HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); - HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); - } else if (CompareVT == MVT::i32) { - HWTrue = DAG.getConstant(-1, DL, CompareVT); - HWFalse = DAG.getConstant(0, DL, CompareVT); - } - else { - llvm_unreachable("Unhandled value type in LowerSELECT_CC"); - } - - // Lower this unsupported SELECT_CC into a combination of two supported - // SELECT_CC operations. - SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); - - return DAG.getNode(ISD::SELECT_CC, DL, VT, - Cond, HWFalse, - True, False, - DAG.getCondCode(ISD::SETNE)); -} - -/// LLVM generates byte-addressed pointers. 
For indirect addressing, we need to
-/// convert these pointers to a register index. Each register holds
-/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
-/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
-/// for indirect addressing.
-SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
-                                               unsigned StackWidth,
-                                               SelectionDAG &DAG) const {
-  unsigned SRLPad;
-  switch(StackWidth) {
-  case 1:
-    SRLPad = 2;
-    break;
-  case 2:
-    SRLPad = 3;
-    break;
-  case 4:
-    SRLPad = 4;
-    break;
-  default: llvm_unreachable("Invalid stack width");
-  }
-
-  SDLoc DL(Ptr);
-  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
-                     DAG.getConstant(SRLPad, DL, MVT::i32));
-}
-
-void R600TargetLowering::getStackAddress(unsigned StackWidth,
-                                         unsigned ElemIdx,
-                                         unsigned &Channel,
-                                         unsigned &PtrIncr) const {
-  switch (StackWidth) {
-  default:
-  case 1:
-    Channel = 0;
-    if (ElemIdx > 0) {
-      PtrIncr = 1;
-    } else {
-      PtrIncr = 0;
-    }
-    break;
-  case 2:
-    Channel = ElemIdx % 2;
-    if (ElemIdx == 2) {
-      PtrIncr = 1;
-    } else {
-      PtrIncr = 0;
-    }
-    break;
-  case 4:
-    Channel = ElemIdx;
-    PtrIncr = 0;
-    break;
-  }
-}
-
-SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
-  SDValue Chain = Op.getOperand(0);
-  SDValue Value = Op.getOperand(1);
-  SDValue Ptr = Op.getOperand(2);
-
-  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
-  if (Result.getNode()) {
-    return Result;
-  }
-
-  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
-    if (StoreNode->isTruncatingStore()) {
-      EVT VT = Value.getValueType();
-      assert(VT.bitsLE(MVT::i32));
-      EVT MemVT = StoreNode->getMemoryVT();
-      SDValue MaskConstant;
-      if (MemVT == MVT::i8) {
-        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
-      } else {
-        assert(MemVT == MVT::i16);
-        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
-      }
-      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
-                                      DAG.getConstant(2, DL, MVT::i32));
-      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
-                                      DAG.getConstant(0x00000003, DL, VT));
-      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
-      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
-                                  DAG.getConstant(3, DL, VT));
-      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
-      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
-      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
-      // vector instead.
-      SDValue Src[4] = {
-        ShiftedValue,
-        DAG.getConstant(0, DL, MVT::i32),
-        DAG.getConstant(0, DL, MVT::i32),
-        Mask
-      };
-      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
-      SDValue Args[3] = { Chain, Input, DWordAddr };
-      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
-                                     Op->getVTList(), Args, MemVT,
-                                     StoreNode->getMemOperand());
-    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
-               Value.getValueType().bitsGE(MVT::i32)) {
-      // Convert pointer from byte address to dword address.
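-      // E.g. a dword stored at byte offset 8 gets dword address
-      // 8 >> 2 == 2 (worked value for illustration).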
- Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), - DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), - Ptr, DAG.getConstant(2, DL, MVT::i32))); - - if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { - llvm_unreachable("Truncated and indexed stores not supported yet"); - } else { - Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); - } - return Chain; - } - } - - EVT ValueVT = Value.getValueType(); - - if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { - return SDValue(); - } - - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) { - return Ret; - } - // Lowering for indirect addressing - - const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast(Subtarget->getFrameLowering()); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (ValueVT.isVector()) { - unsigned NumElemVT = ValueVT.getVectorNumElements(); - EVT ElemVT = ValueVT.getVectorElementType(); - SmallVector Stores(NumElemVT); - - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, - Value, DAG.getConstant(i, DL, MVT::i32)); - - Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Elem, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32)); - } - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); - } else { - if (ValueVT == MVT::i8) { - Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); - } - Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); // Channel - } - - return Chain; -} - -// return (512 + (kc_bank << 12) -static int -ConstantAddressBlock(unsigned AddressSpace) { - switch (AddressSpace) { - case AMDGPUAS::CONSTANT_BUFFER_0: - return 512; - case AMDGPUAS::CONSTANT_BUFFER_1: - return 512 + 4096; - case AMDGPUAS::CONSTANT_BUFFER_2: - return 512 + 4096 * 2; - case AMDGPUAS::CONSTANT_BUFFER_3: - return 512 + 4096 * 3; - case AMDGPUAS::CONSTANT_BUFFER_4: - return 512 + 4096 * 4; - case AMDGPUAS::CONSTANT_BUFFER_5: - return 512 + 4096 * 5; - case AMDGPUAS::CONSTANT_BUFFER_6: - return 512 + 4096 * 6; - case AMDGPUAS::CONSTANT_BUFFER_7: - return 512 + 4096 * 7; - case AMDGPUAS::CONSTANT_BUFFER_8: - return 512 + 4096 * 8; - case AMDGPUAS::CONSTANT_BUFFER_9: - return 512 + 4096 * 9; - case AMDGPUAS::CONSTANT_BUFFER_10: - return 512 + 4096 * 10; - case AMDGPUAS::CONSTANT_BUFFER_11: - return 512 + 4096 * 11; - case AMDGPUAS::CONSTANT_BUFFER_12: - return 512 + 4096 * 12; - case AMDGPUAS::CONSTANT_BUFFER_13: - return 512 + 4096 * 13; - case AMDGPUAS::CONSTANT_BUFFER_14: - return 512 + 4096 * 14; - case AMDGPUAS::CONSTANT_BUFFER_15: - return 512 + 4096 * 15; - default: - return -1; - } -} - -SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const -{ - EVT VT = Op.getValueType(); - SDLoc DL(Op); - LoadSDNode *LoadNode = cast(Op); - SDValue Chain = Op.getOperand(0); - SDValue Ptr = Op.getOperand(1); - SDValue LoweredLoad; - - SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); - if (Ret.getNode()) { - SDValue Ops[2] = { - Ret, - Chain - }; - return DAG.getMergeValues(Ops, DL); - } - - // Lower 
loads constant address space global variable loads - if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa(GetUnderlyingObject( - LoadNode->getMemOperand()->getValue(), *getDataLayout()))) { - - SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL, - getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); - Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(2, DL, MVT::i32)); - return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), - LoadNode->getChain(), Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); - } - - if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { - SDValue MergedValues[2] = { - ScalarizeVectorLoad(Op, DAG), - Chain - }; - return DAG.getMergeValues(MergedValues, DL); - } - - int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); - if (ConstantBlock > -1 && - ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || - (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { - SDValue Result; - if (isa(LoadNode->getMemOperand()->getValue()) || - isa(LoadNode->getMemOperand()->getValue()) || - isa(Ptr)) { - SDValue Slots[4]; - for (unsigned i = 0; i < 4; i++) { - // We want Const position encoded with the following formula : - // (((512 + (kc_bank << 12) + const_index) << 2) + chan) - // const_index is Ptr computed by llvm using an alignment of 16. - // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and - // then div by 4 at the ISel step - SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); - Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); - } - EVT NewVT = MVT::v4i32; - unsigned NumElements = 4; - if (VT.isVector()) { - NewVT = VT; - NumElements = VT.getVectorNumElements(); - } - Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, - makeArrayRef(Slots, NumElements)); - } else { - // non-constant ptr can't be folded, keeps it as a v4f32 load - Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, - DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(4, DL, MVT::i32)), - DAG.getConstant(LoadNode->getAddressSpace() - - AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) - ); - } - - if (!VT.isVector()) { - Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, - DAG.getConstant(0, DL, MVT::i32)); - } - - SDValue MergedValues[2] = { - Result, - Chain - }; - return DAG.getMergeValues(MergedValues, DL); - } - - // For most operations returning SDValue() will result in the node being - // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we - // need to manually expand loads that may be legal in some address spaces and - // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for - // compute shaders, since the data is sign extended when it is uploaded to the - // buffer. However SEXT loads from other address spaces are not supported, so - // we need to expand them here. 
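  // ---- Editor's aside: illustrative sketch, not part of the original file ----
  // The block below rewrites an unsupported sextload as a plain extload
  // followed by SIGN_EXTEND_INREG. In plain scalar C++ the second step is
  // equivalent to (hypothetical helper, MemVT == i8 case):
  //
  //   static int signExtendInReg8(int Loaded) {
  //     // Keep the low 8 bits and replicate bit 7 into the upper bits.
  //     return static_cast<signed char>(Loaded & 0xff);
  //   }
  // ---- end editor's aside ----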
- if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { - EVT MemVT = LoadNode->getMemoryVT(); - assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); - SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, - LoadNode->getPointerInfo(), MemVT, - LoadNode->isVolatile(), - LoadNode->isNonTemporal(), - LoadNode->isInvariant(), - LoadNode->getAlignment()); - SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, - DAG.getValueType(MemVT)); - - SDValue MergedValues[2] = { Res, Chain }; - return DAG.getMergeValues(MergedValues, DL); - } - - if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { - return SDValue(); - } - - // Lowering for indirect addressing - const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast(Subtarget->getFrameLowering()); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (VT.isVector()) { - unsigned NumElemVT = VT.getVectorNumElements(); - EVT ElemVT = VT.getVectorElementType(); - SDValue Loads[4]; - - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, - Chain, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32), - Op.getOperand(2)); - } - for (unsigned i = NumElemVT; i < 4; ++i) { - Loads[i] = DAG.getUNDEF(ElemVT); - } - EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); - LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); - } else { - LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), // Channel - Op.getOperand(2)); - } - - SDValue Ops[2] = { - LoweredLoad, - Chain - }; - - return DAG.getMergeValues(Ops, DL); -} - -SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - SDValue Cond = Op.getOperand(1); - SDValue Jump = Op.getOperand(2); - - return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), - Chain, Jump, Cond); -} - -/// XXX Only kernel functions are supported, so we can assume for now that -/// every function is a kernel function, but in the future we should use -/// separate calling conventions for kernel and non-kernel functions. -SDValue R600TargetLowering::LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); - MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo(); - - SmallVector LocalIns; - - getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); - - AnalyzeFormalArguments(CCInfo, LocalIns); - - for (unsigned i = 0, e = Ins.size(); i < e; ++i) { - CCValAssign &VA = ArgLocs[i]; - const ISD::InputArg &In = Ins[i]; - EVT VT = In.VT; - EVT MemVT = VA.getLocVT(); - if (!VT.isVector() && MemVT.isVector()) { - // Get load source type if scalarized. 
-      MemVT = MemVT.getVectorElementType();
-    }
-
-    if (MFI->getShaderType() != ShaderType::COMPUTE) {
-      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
-      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-      InVals.push_back(Register);
-      continue;
-    }
-
-    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                          AMDGPUAS::CONSTANT_BUFFER_0);
-
-    // i64 isn't a legal type, so the register type used ends up as i32, which
-    // isn't expected here. It attempts to create this sextload, but it ends up
-    // being invalid. Somehow this seems to work with i64 arguments, but breaks
-    // for <1 x i64>.
-
-    // The first 36 bytes of the input buffer contain information about
-    // thread group and global sizes.
-    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
-    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
-      // FIXME: This should really check the extload type, but the handling of
-      // extload vector parameters seems to be broken.
-
-      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
-      Ext = ISD::SEXTLOAD;
-    }
-
-    // Compute the offset from the value.
-    // XXX - I think PartOffset should give you this, but it seems to give the
-    // size of the register which isn't useful.
-
-    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
-    unsigned PartOffset = VA.getLocMemOffset();
-    unsigned Offset = 36 + VA.getLocMemOffset();
-
-    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
-    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
-                              DAG.getConstant(Offset, DL, MVT::i32),
-                              DAG.getUNDEF(MVT::i32),
-                              PtrInfo,
-                              MemVT, false, true, true, 4);
-
-    // 4 is the preferred alignment for the CONSTANT memory space.
-    InVals.push_back(Arg);
-    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
-  }
-  return Chain;
-}
-
-EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
-  if (!VT.isVector())
-    return MVT::i32;
-  return VT.changeVectorElementTypeToInteger();
-}
-
-static SDValue CompactSwizzlableVector(
-  SelectionDAG &DAG, SDValue VectorEntry,
-  DenseMap<unsigned, unsigned> &RemapSwizzle) {
-  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
-  assert(RemapSwizzle.empty());
-  SDValue NewBldVec[4] = {
-    VectorEntry.getOperand(0),
-    VectorEntry.getOperand(1),
-    VectorEntry.getOperand(2),
-    VectorEntry.getOperand(3)
-  };
-
-  for (unsigned i = 0; i < 4; i++) {
-    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
-      // We mask the write here to teach later passes that the ith element of
-      // this vector is undef. Thus we can use it to reduce 128-bit register
-      // usage, break false dependencies and additionally make assembly easier
-      // to read.
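    // ---- Editor's aside: reference note, not part of the original file ----
    // The swizzle "sel" values written into RemapSwizzle here and below follow
    // the R600 source-select encoding: 0-3 pick the X/Y/Z/W channel, 4 = SEL_0
    // (constant 0.0), 5 = SEL_1 (constant 1.0), 7 = SEL_MASK_WRITE (channel
    // not written).
    // ---- end editor's aside ----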
- RemapSwizzle[i] = 7; // SEL_MASK_WRITE - if (ConstantFPSDNode *C = dyn_cast(NewBldVec[i])) { - if (C->isZero()) { - RemapSwizzle[i] = 4; // SEL_0 - NewBldVec[i] = DAG.getUNDEF(MVT::f32); - } else if (C->isExactlyValue(1.0)) { - RemapSwizzle[i] = 5; // SEL_1 - NewBldVec[i] = DAG.getUNDEF(MVT::f32); - } - } - - if (NewBldVec[i].getOpcode() == ISD::UNDEF) - continue; - for (unsigned j = 0; j < i; j++) { - if (NewBldVec[i] == NewBldVec[j]) { - NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); - RemapSwizzle[i] = j; - break; - } - } - } - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); -} - -static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, - DenseMap &RemapSwizzle) { - assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); - assert(RemapSwizzle.empty()); - SDValue NewBldVec[4] = { - VectorEntry.getOperand(0), - VectorEntry.getOperand(1), - VectorEntry.getOperand(2), - VectorEntry.getOperand(3) - }; - bool isUnmovable[4] = { false, false, false, false }; - for (unsigned i = 0; i < 4; i++) { - RemapSwizzle[i] = i; - if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) - ->getZExtValue(); - if (i == Idx) - isUnmovable[Idx] = true; - } - } - - for (unsigned i = 0; i < 4; i++) { - if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) - ->getZExtValue(); - if (isUnmovable[Idx]) - continue; - // Swap i and Idx - std::swap(NewBldVec[Idx], NewBldVec[i]); - std::swap(RemapSwizzle[i], RemapSwizzle[Idx]); - break; - } - } - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); -} - - -SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, - SDValue Swz[4], SelectionDAG &DAG, - SDLoc DL) const { - assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); - // Old -> New swizzle values - DenseMap SwizzleRemap; - - BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); - for (unsigned i = 0; i < 4; i++) { - unsigned Idx = cast(Swz[i])->getZExtValue(); - if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); - } - - SwizzleRemap.clear(); - BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); - for (unsigned i = 0; i < 4; i++) { - unsigned Idx = cast(Swz[i])->getZExtValue(); - if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); - } - - return BuildVector; -} - - -//===----------------------------------------------------------------------===// -// Custom DAG Optimizations -//===----------------------------------------------------------------------===// - -SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - - switch (N->getOpcode()) { - default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) - case ISD::FP_ROUND: { - SDValue Arg = N->getOperand(0); - if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { - return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0), - Arg.getOperand(0)); - } - break; - } - - // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> - // (i32 select_cc f32, f32, -1, 0 cc) - // - // Mesa's GLSL frontend generates the above pattern a lot and we can lower - // this to one of the SET*_DX10 instructions. 
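  // ---- Editor's aside: illustrative sketch, not part of the original file ----
  // A scalar model of this combine (hypothetical helper, SETLT shown):
  //
  //   static int setLtDX10(float A, float B) {
  //     // (i32 fp_to_sint (fneg (select_cc a, b, 1.0f, 0.0f, setlt)))
  //     //   == (i32 select_cc a, b, -1, 0, setlt)
  //     return (A < B) ? -1 : 0;
  //   }
  // ---- end editor's aside ----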
- case ISD::FP_TO_SINT: { - SDValue FNeg = N->getOperand(0); - if (FNeg.getOpcode() != ISD::FNEG) { - return SDValue(); - } - SDValue SelectCC = FNeg.getOperand(0); - if (SelectCC.getOpcode() != ISD::SELECT_CC || - SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS - SelectCC.getOperand(2).getValueType() != MVT::f32 || // True - !isHWTrueValue(SelectCC.getOperand(2)) || - !isHWFalseValue(SelectCC.getOperand(3))) { - return SDValue(); - } - - SDLoc dl(N); - return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0), - SelectCC.getOperand(0), // LHS - SelectCC.getOperand(1), // RHS - DAG.getConstant(-1, dl, MVT::i32), // True - DAG.getConstant(0, dl, MVT::i32), // False - SelectCC.getOperand(4)); // CC - - break; - } - - // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx - // => build_vector elt0, ... , NewEltIdx, ... , eltN - case ISD::INSERT_VECTOR_ELT: { - SDValue InVec = N->getOperand(0); - SDValue InVal = N->getOperand(1); - SDValue EltNo = N->getOperand(2); - SDLoc dl(N); - - // If the inserted element is an UNDEF, just use the input vector. - if (InVal.getOpcode() == ISD::UNDEF) - return InVec; - - EVT VT = InVec.getValueType(); - - // If we can't generate a legal BUILD_VECTOR, exit - if (!isOperationLegal(ISD::BUILD_VECTOR, VT)) - return SDValue(); - - // Check that we know which element is being inserted - if (!isa(EltNo)) - return SDValue(); - unsigned Elt = cast(EltNo)->getZExtValue(); - - // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially - // be converted to a BUILD_VECTOR). Fill in the Ops vector with the - // vector elements. - SmallVector Ops; - if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - Ops.append(InVec.getNode()->op_begin(), - InVec.getNode()->op_end()); - } else if (InVec.getOpcode() == ISD::UNDEF) { - unsigned NElts = VT.getVectorNumElements(); - Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); - } else { - return SDValue(); - } - - // Insert the element - if (Elt < Ops.size()) { - // All the operands of BUILD_VECTOR must have the same type; - // we enforce that here. - EVT OpVT = Ops[0].getValueType(); - if (InVal.getValueType() != OpVT) - InVal = OpVT.bitsGT(InVal.getValueType()) ? 
- DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) : - DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal); - Ops[Elt] = InVal; - } - - // Return the new vector - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); - } - - // Extract_vec (Build_vector) generated by custom lowering - // also needs to be customly combined - case ISD::EXTRACT_VECTOR_ELT: { - SDValue Arg = N->getOperand(0); - if (Arg.getOpcode() == ISD::BUILD_VECTOR) { - if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { - unsigned Element = Const->getZExtValue(); - return Arg->getOperand(Element); - } - } - if (Arg.getOpcode() == ISD::BITCAST && - Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { - if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { - unsigned Element = Const->getZExtValue(); - return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(), - Arg->getOperand(0).getOperand(Element)); - } - } - } - - case ISD::SELECT_CC: { - // Try common optimizations - SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - if (Ret.getNode()) - return Ret; - - // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> - // selectcc x, y, a, b, inv(cc) - // - // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> - // selectcc x, y, a, b, cc - SDValue LHS = N->getOperand(0); - if (LHS.getOpcode() != ISD::SELECT_CC) { - return SDValue(); - } - - SDValue RHS = N->getOperand(1); - SDValue True = N->getOperand(2); - SDValue False = N->getOperand(3); - ISD::CondCode NCC = cast(N->getOperand(4))->get(); - - if (LHS.getOperand(2).getNode() != True.getNode() || - LHS.getOperand(3).getNode() != False.getNode() || - RHS.getNode() != False.getNode()) { - return SDValue(); - } - - switch (NCC) { - default: return SDValue(); - case ISD::SETNE: return LHS; - case ISD::SETEQ: { - ISD::CondCode LHSCC = cast(LHS.getOperand(4))->get(); - LHSCC = ISD::getSetCCInverse(LHSCC, - LHS.getOperand(0).getValueType().isInteger()); - if (DCI.isBeforeLegalizeOps() || - isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType())) - return DAG.getSelectCC(SDLoc(N), - LHS.getOperand(0), - LHS.getOperand(1), - LHS.getOperand(2), - LHS.getOperand(3), - LHSCC); - break; - } - } - return SDValue(); - } - - case AMDGPUISD::EXPORT: { - SDValue Arg = N->getOperand(1); - if (Arg.getOpcode() != ISD::BUILD_VECTOR) - break; - - SDValue NewArgs[8] = { - N->getOperand(0), // Chain - SDValue(), - N->getOperand(2), // ArrayBase - N->getOperand(3), // Type - N->getOperand(4), // SWZ_X - N->getOperand(5), // SWZ_Y - N->getOperand(6), // SWZ_Z - N->getOperand(7) // SWZ_W - }; - SDLoc DL(N); - NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); - return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); - } - case AMDGPUISD::TEXTURE_FETCH: { - SDValue Arg = N->getOperand(1); - if (Arg.getOpcode() != ISD::BUILD_VECTOR) - break; - - SDValue NewArgs[19] = { - N->getOperand(0), - N->getOperand(1), - N->getOperand(2), - N->getOperand(3), - N->getOperand(4), - N->getOperand(5), - N->getOperand(6), - N->getOperand(7), - N->getOperand(8), - N->getOperand(9), - N->getOperand(10), - N->getOperand(11), - N->getOperand(12), - N->getOperand(13), - N->getOperand(14), - N->getOperand(15), - N->getOperand(16), - N->getOperand(17), - N->getOperand(18), - }; - SDLoc DL(N); - NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); - return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); - } - } - - return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); -} - -static bool -FoldOperand(SDNode *ParentNode, unsigned 
SrcIdx, SDValue &Src, SDValue &Neg, - SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { - const R600InstrInfo *TII = - static_cast(DAG.getSubtarget().getInstrInfo()); - if (!Src.isMachineOpcode()) - return false; - switch (Src.getMachineOpcode()) { - case AMDGPU::FNEG_R600: - if (!Neg.getNode()) - return false; - Src = Src.getOperand(0); - Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); - return true; - case AMDGPU::FABS_R600: - if (!Abs.getNode()) - return false; - Src = Src.getOperand(0); - Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); - return true; - case AMDGPU::CONST_COPY: { - unsigned Opcode = ParentNode->getMachineOpcode(); - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; - - if (!Sel.getNode()) - return false; - - SDValue CstOffset = Src.getOperand(0); - if (ParentNode->getValueType(0).isVector()) - return false; - - // Gather constants values - int SrcIndices[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) - }; - std::vector Consts; - for (int OtherSrcIdx : SrcIndices) { - int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); - if (OtherSrcIdx < 0 || OtherSelIdx < 0) - continue; - if (HasDst) { - OtherSrcIdx--; - OtherSelIdx--; - } - if (RegisterSDNode *Reg = - dyn_cast(ParentNode->getOperand(OtherSrcIdx))) { - if (Reg->getReg() == AMDGPU::ALU_CONST) { - ConstantSDNode *Cst - = cast(ParentNode->getOperand(OtherSelIdx)); - Consts.push_back(Cst->getZExtValue()); - } - } - } - - ConstantSDNode *Cst = cast(CstOffset); - Consts.push_back(Cst->getZExtValue()); - if (!TII->fitsConstReadLimitations(Consts)) { - return false; - } - - Sel = CstOffset; - Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); - return true; - } - case AMDGPU::MOV_IMM_I32: - case AMDGPU::MOV_IMM_F32: { - unsigned ImmReg = AMDGPU::ALU_LITERAL_X; - uint64_t ImmValue = 0; - - - if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { - ConstantFPSDNode *FPC = dyn_cast(Src.getOperand(0)); - float FloatValue = FPC->getValueAPF().convertToFloat(); - if (FloatValue == 0.0) { - ImmReg = AMDGPU::ZERO; - } else if (FloatValue == 0.5) { - ImmReg = AMDGPU::HALF; - } else if (FloatValue == 1.0) { - ImmReg = AMDGPU::ONE; - } else { - ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); - } - } else { - ConstantSDNode *C = dyn_cast(Src.getOperand(0)); - uint64_t Value = C->getZExtValue(); - if (Value == 0) { - ImmReg = AMDGPU::ZERO; - } else if (Value == 1) { - ImmReg = AMDGPU::ONE_INT; - } else { - ImmValue = Value; - } - } - - // Check that we aren't already using an immediate. - // XXX: It's possible for an instruction to have more than one - // immediate operand, but this is not supported yet. 
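    // ---- Editor's aside: illustrative sketch, not part of the original file ----
    // Scalar summary of the classification above (hypothetical helper): the
    // values 0.0, 0.5 and 1.0 (and integer 0/1) map to dedicated inline
    // constant registers; anything else must occupy the single ALU_LITERAL_X
    // slot, which the check below guards.
    //
    //   static bool needsLiteralSlot(float V) {
    //     return V != 0.0f && V != 0.5f && V != 1.0f;
    //   }
    // ---- end editor's aside ----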
- if (ImmReg == AMDGPU::ALU_LITERAL_X) { - if (!Imm.getNode()) - return false; - ConstantSDNode *C = dyn_cast(Imm); - assert(C); - if (C->getZExtValue()) - return false; - Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); - } - Src = DAG.getRegister(ImmReg, MVT::i32); - return true; - } - default: - return false; - } -} - - -/// \brief Fold the instructions after selecting them -SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, - SelectionDAG &DAG) const { - const R600InstrInfo *TII = - static_cast(DAG.getSubtarget().getInstrInfo()); - if (!Node->isMachineOpcode()) - return Node; - unsigned Opcode = Node->getMachineOpcode(); - SDValue FakeOp; - - std::vector Ops(Node->op_begin(), Node->op_end()); - - if (Opcode == AMDGPU::DOT_4) { - int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) - }; - int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) - }; - int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) - }; - for (unsigned i = 0; i < 8; i++) { - if (OperandIdx[i] < 0) - return Node; - SDValue &Src = Ops[OperandIdx[i] - 1]; - SDValue &Neg = Ops[NegIdx[i] - 1]; - SDValue &Abs = Ops[AbsIdx[i] - 1]; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; - int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); - if (HasDst) - SelIdx--; - SDValue &Sel = (SelIdx > -1) ? 
Ops[SelIdx] : FakeOp; - if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) - return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); - } - } else if (Opcode == AMDGPU::REG_SEQUENCE) { - for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { - SDValue &Src = Ops[i]; - if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) - return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); - } - } else if (Opcode == AMDGPU::CLAMP_R600) { - SDValue Src = Node->getOperand(0); - if (!Src.isMachineOpcode() || - !TII->hasInstrModifiers(Src.getMachineOpcode())) - return Node; - int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), - AMDGPU::OpName::clamp); - if (ClampIdx < 0) - return Node; - SDLoc DL(Node); - std::vector Ops(Src->op_begin(), Src->op_end()); - Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); - return DAG.getMachineNode(Src.getMachineOpcode(), DL, - Node->getVTList(), Ops); - } else { - if (!TII->hasInstrModifiers(Opcode)) - return Node; - int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) - }; - int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) - }; - int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), - -1 - }; - for (unsigned i = 0; i < 3; i++) { - if (OperandIdx[i] < 0) - return Node; - SDValue &Src = Ops[OperandIdx[i] - 1]; - SDValue &Neg = Ops[NegIdx[i] - 1]; - SDValue FakeAbs; - SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; - int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); - int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); - if (HasDst) { - SelIdx--; - ImmIdx--; - } - SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; - SDValue &Imm = Ops[ImmIdx]; - if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG)) - return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); - } - } - - return Node; -} Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp =================================================================== --- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp (revision 288077) +++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp (nonexistent) @@ -1,1912 +0,0 @@ -//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -/// \file -//==-----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUInstrInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Dominators.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "structcfg" - -#define DEFAULT_VEC_SLOTS 8 - -// TODO: move-begin. - -//===----------------------------------------------------------------------===// -// -// Statistics for CFGStructurizer. -// -//===----------------------------------------------------------------------===// - -STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " - "matched"); -STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " - "matched"); -STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " - "pattern matched"); -STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); -STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); - -namespace llvm { - void initializeAMDGPUCFGStructurizerPass(PassRegistry&); -} - -//===----------------------------------------------------------------------===// -// -// Miscellaneous utility for CFGStructurizer. 
-//
-//===----------------------------------------------------------------------===//
-namespace {
-#define SHOWNEWINSTR(i) \
-  DEBUG(dbgs() << "New instr: " << *i << "\n");
-
-#define SHOWNEWBLK(b, msg) \
-DEBUG( \
-  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
-  dbgs() << "\n"; \
-);
-
-#define SHOWBLK_DETAIL(b, msg) \
-DEBUG( \
-  if (b) { \
-    dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
-    b->print(dbgs()); \
-    dbgs() << "\n"; \
-  } \
-);
-
-#define INVALIDSCCNUM -1
-
-template<class NodeT>
-void ReverseVector(SmallVectorImpl<NodeT> &Src) {
-  size_t sz = Src.size();
-  for (size_t i = 0; i < sz/2; ++i) {
-    NodeT *t = Src[i];
-    Src[i] = Src[sz - i - 1];
-    Src[sz - i - 1] = t;
-  }
-}
-
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-//
-// supporting data structure for CFGStructurizer
-//
-//===----------------------------------------------------------------------===//
-
-
-namespace {
-
-class BlockInformation {
-public:
-  bool IsRetired;
-  int SccNum;
-  BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {}
-};
-
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-//
-// CFGStructurizer
-//
-//===----------------------------------------------------------------------===//
-
-namespace {
-class AMDGPUCFGStructurizer : public MachineFunctionPass {
-public:
-  typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
-  typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap;
-  typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap;
-
-  enum PathToKind {
-    Not_SinglePath = 0,
-    SinglePath_InPath = 1,
-    SinglePath_NotInPath = 2
-  };
-
-  static char ID;
-
-  AMDGPUCFGStructurizer() :
-      MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {
-    initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
-  }
-
-  const char *getPassName() const override {
-    return "AMDGPU Control Flow Graph structurizer Pass";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addPreserved<MachineFunctionAnalysis>();
-    AU.addRequired<MachineFunctionAnalysis>();
-    AU.addRequired<MachineDominatorTree>();
-    AU.addRequired<MachinePostDominatorTree>();
-    AU.addRequired<MachineLoopInfo>();
-  }
-
-  /// Perform the CFG structurization
-  bool run();
-
-  /// Perform the CFG preparation
-  /// This step will remove every unconditional/dead jump instruction and make
-  /// sure all loops have an exit block
-  bool prepare();
-
-  bool runOnMachineFunction(MachineFunction &MF) override {
-    TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
-    TRI = &TII->getRegisterInfo();
-    DEBUG(MF.dump(););
-    OrderedBlks.clear();
-    Visited.clear();
-    FuncRep = &MF;
-    MLI = &getAnalysis<MachineLoopInfo>();
-    DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
-    MDT = &getAnalysis<MachineDominatorTree>();
-    DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr););
-    PDT = &getAnalysis<MachinePostDominatorTree>();
-    DEBUG(PDT->print(dbgs()););
-    prepare();
-    run();
-    DEBUG(MF.dump(););
-    return true;
-  }
-
-protected:
-  MachineDominatorTree *MDT;
-  MachinePostDominatorTree *PDT;
-  MachineLoopInfo *MLI;
-  const R600InstrInfo *TII;
-  const AMDGPURegisterInfo *TRI;
-
-  // PRINT FUNCTIONS
-  /// Print the ordered Blocks.
- void printOrderedBlocks() const { - size_t i = 0; - for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(), - iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) { - dbgs() << "BB" << (*iterBlk)->getNumber(); - dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; - if (i != 0 && i % 10 == 0) { - dbgs() << "\n"; - } else { - dbgs() << " "; - } - } - } - static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { - for (MachineLoop::iterator iter = LoopInfo.begin(), - iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { - (*iter)->print(dbgs(), 0); - } - } - - // UTILITY FUNCTIONS - int getSCCNum(MachineBasicBlock *MBB) const; - MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const; - bool hasBackEdge(MachineBasicBlock *MBB) const; - static unsigned getLoopDepth(MachineLoop *LoopRep); - bool isRetiredBlock(MachineBasicBlock *MBB) const; - bool isActiveLoophead(MachineBasicBlock *MBB) const; - PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, - bool AllowSideEntry = true) const; - int countActiveBlock(MBBVector::const_iterator It, - MBBVector::const_iterator E) const; - bool needMigrateBlock(MachineBasicBlock *MBB) const; - - // Utility Functions - void reversePredicateSetter(MachineBasicBlock::iterator I); - /// Compute the reversed DFS post order of Blocks - void orderBlocks(MachineFunction *MF); - - // Function originally from CFGStructTraits - void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, - DebugLoc DL = DebugLoc()); - MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, - DebugLoc DL = DebugLoc()); - MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode); - void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode, - DebugLoc DL); - void insertCondBranchBefore(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, int NewOpcode, int RegNum, - DebugLoc DL); - void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum); - static int getBranchNzeroOpcode(int OldOpcode); - static int getBranchZeroOpcode(int OldOpcode); - static int getContinueNzeroOpcode(int OldOpcode); - static int getContinueZeroOpcode(int OldOpcode); - static MachineBasicBlock *getTrueBranch(MachineInstr *MI); - static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB); - static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB, - MachineInstr *MI); - static bool isCondBranch(MachineInstr *MI); - static bool isUncondBranch(MachineInstr *MI); - static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB); - static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB); - /// The correct naming for this is getPossibleLoopendBlockBranchInstr. - /// - /// BB with backward-edge could have move instructions after the branch - /// instruction. Such move instruction "belong to" the loop backward-edge. 
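  // ---- Editor's aside: hypothetical example, not part of the original file ----
  // Such a loop-end block could look like (schematic machine code):
  //   bb.latch:
  //     JUMP_COND %bb.header
  //     MOV %r0, %r1   ; trailing moves belong to the back-edge, so the
  //                    ; implementation scans backwards past them.
  // ---- end editor's aside ----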
-  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
-  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
-  static MachineInstr *getContinueInstr(MachineBasicBlock *MBB);
-  static bool isReturnBlock(MachineBasicBlock *MBB);
-  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
-                                 MachineBasicBlock *SrcMBB);
-  static MachineBasicBlock *clone(MachineBasicBlock *MBB);
-  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
-  /// because the AMDGPU instruction is not recognized as a terminator;
-  /// fix this and retire this routine
-  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
-      MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
-  static void wrapup(MachineBasicBlock *MBB);
-
-
-  int patternMatch(MachineBasicBlock *MBB);
-  int patternMatchGroup(MachineBasicBlock *MBB);
-  int serialPatternMatch(MachineBasicBlock *MBB);
-  int ifPatternMatch(MachineBasicBlock *MBB);
-  int loopendPatternMatch();
-  int mergeLoop(MachineLoop *LoopRep);
-  int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader);
-
-  void handleLoopcontBlock(MachineBasicBlock *ContingMBB,
-      MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
-      MachineLoop *ContLoop);
-  /// Return true iff src1Blk->succ_size() == 0 and src1Blk and src2Blk are in
-  /// the same loop with LoopLandInfo. Without explicitly keeping track of
-  /// loopContBlks and loopBreakBlks, this is a way to get that information.
-  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
-      MachineBasicBlock *Src2MBB);
-  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
-      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
-  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
-      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
-  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
-      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
-      MachineBasicBlock **LandMBBPtr);
-  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
-      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
-      MachineBasicBlock *LandMBB, bool Detail = false);
-  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
-      MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
-  void mergeSerialBlock(MachineBasicBlock *DstMBB,
-      MachineBasicBlock *SrcMBB);
-
-  void mergeIfthenelseBlock(MachineInstr *BranchMI,
-      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
-      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
-  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
-      MachineBasicBlock *LandMBB);
-  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
-      MachineBasicBlock *LandMBB);
-  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
-      MachineBasicBlock *ContMBB);
-  /// normalizeInfiniteLoopExit changes
-  /// B1:
-  ///   uncond_br LoopHeader
-  /// to
-  /// B1:
-  ///   cond_br 1 LoopHeader dummyExit
-  /// and returns the newly added dummy exit block
-  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
-  void removeUnconditionalBranch(MachineBasicBlock *MBB);
-  /// Remove duplicate branch instructions in a block.
-  /// For instance
-  /// B0:
-  ///   cond_br X B1 B2
-  ///   cond_br X B1 B2
-  /// is transformed to
-  /// B0:
-  ///   cond_br X B1 B2
-  void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
-  void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
-  void removeSuccessor(MachineBasicBlock *MBB);
-  MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
-      MachineBasicBlock *PredMBB);
-  void migrateInstruction(MachineBasicBlock *SrcMBB,
-      MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
-  void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
-  void retireBlock(MachineBasicBlock *MBB);
-  void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr);
-
-  MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
-  /// This is a workaround: findNearestCommonDominator is not available for
-  /// post-dominators; a proper fix should go into Dominators.h.
-  MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
-      MachineBasicBlock *MBB2);
-
-private:
-  MBBInfoMap BlockInfoMap;
-  LoopLandInfoMap LLInfoMap;
-  std::map<MachineLoop *, bool> Visited;
-  MachineFunction *FuncRep;
-  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
-};
-
-int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
-  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
-  if (It == BlockInfoMap.end())
-    return INVALIDSCCNUM;
-  return (*It).second->SccNum;
-}
-
-MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
-    const {
-  LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
-  if (It == LLInfoMap.end())
-    return nullptr;
-  return (*It).second;
-}
-
-bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
-  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
-  if (!LoopRep)
-    return false;
-  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
-  return MBB->isSuccessor(LoopHeader);
-}
-
-unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) {
-  return LoopRep ?
LoopRep->getLoopDepth() : 0; -} - -bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { - MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); - if (It == BlockInfoMap.end()) - return false; - return (*It).second->IsRetired; -} - -bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { - MachineLoop *LoopRep = MLI->getLoopFor(MBB); - while (LoopRep && LoopRep->getHeader() == MBB) { - MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep); - if(!LoopLand) - return true; - if (!isRetiredBlock(LoopLand)) - return true; - LoopRep = LoopRep->getParentLoop(); - } - return false; -} -AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( - MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, - bool AllowSideEntry) const { - assert(DstMBB); - if (SrcMBB == DstMBB) - return SinglePath_InPath; - while (SrcMBB && SrcMBB->succ_size() == 1) { - SrcMBB = *SrcMBB->succ_begin(); - if (SrcMBB == DstMBB) - return SinglePath_InPath; - if (!AllowSideEntry && SrcMBB->pred_size() > 1) - return Not_SinglePath; - } - if (SrcMBB && SrcMBB->succ_size()==0) - return SinglePath_NotInPath; - return Not_SinglePath; -} - -int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, - MBBVector::const_iterator E) const { - int Count = 0; - while (It != E) { - if (!isRetiredBlock(*It)) - ++Count; - ++It; - } - return Count; -} - -bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { - unsigned BlockSizeThreshold = 30; - unsigned CloneInstrThreshold = 100; - bool MultiplePreds = MBB && (MBB->pred_size() > 1); - - if(!MultiplePreds) - return false; - unsigned BlkSize = MBB->size(); - return ((BlkSize > BlockSizeThreshold) && - (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); -} - -void AMDGPUCFGStructurizer::reversePredicateSetter( - MachineBasicBlock::iterator I) { - while (I--) { - if (I->getOpcode() == AMDGPU::PRED_X) { - switch (static_cast(I)->getOperand(2).getImm()) { - case OPCODE_IS_ZERO_INT: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_NOT_ZERO_INT); - return; - case OPCODE_IS_NOT_ZERO_INT: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_ZERO_INT); - return; - case OPCODE_IS_ZERO: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_NOT_ZERO); - return; - case OPCODE_IS_NOT_ZERO: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_ZERO); - return; - default: - llvm_unreachable("PRED_X Opcode invalid!"); - } - } - } -} - -void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, - int NewOpcode, DebugLoc DL) { - MachineInstr *MI = MBB->getParent() - ->CreateMachineInstr(TII->get(NewOpcode), DL); - MBB->push_back(MI); - //assume the instruction doesn't take any reg operand ... - SHOWNEWINSTR(MI); -} - -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, - int NewOpcode, DebugLoc DL) { - MachineInstr *MI = - MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); - if (MBB->begin() != MBB->end()) - MBB->insert(MBB->begin(), MI); - else - MBB->push_back(MI); - SHOWNEWINSTR(MI); - return MI; -} - -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( - MachineBasicBlock::iterator I, int NewOpcode) { - MachineInstr *OldMI = &(*I); - MachineBasicBlock *MBB = OldMI->getParent(); - MachineInstr *NewMBB = - MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); - MBB->insert(I, NewMBB); - //assume the instruction doesn't take any reg operand ... 
- SHOWNEWINSTR(NewMBB); - return NewMBB; -} - -void AMDGPUCFGStructurizer::insertCondBranchBefore( - MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) { - MachineInstr *OldMI = &(*I); - MachineBasicBlock *MBB = OldMI->getParent(); - MachineFunction *MF = MBB->getParent(); - MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); - MBB->insert(I, NewMI); - MachineInstrBuilder MIB(*MF, NewMI); - MIB.addReg(OldMI->getOperand(1).getReg(), false); - SHOWNEWINSTR(NewMI); - //erase later oldInstr->eraseFromParent(); -} - -void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, - MachineBasicBlock::iterator I, int NewOpcode, int RegNum, - DebugLoc DL) { - MachineFunction *MF = blk->getParent(); - MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); - //insert before - blk->insert(I, NewInstr); - MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); - SHOWNEWINSTR(NewInstr); -} - -void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB, - int NewOpcode, int RegNum) { - MachineFunction *MF = MBB->getParent(); - MachineInstr *NewInstr = - MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); - MBB->push_back(NewInstr); - MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); - SHOWNEWINSTR(NewInstr); -} - -int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; - default: llvm_unreachable("internal error"); - }; - return -1; -} - -int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) { - return MI->getOperand(0).getMBB(); -} - -void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI, - MachineBasicBlock *MBB) { - MI->getOperand(0).setMBB(MBB); -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, - MachineInstr *MI) { - assert(MBB->succ_size() == 2); - MachineBasicBlock *TrueBranch = getTrueBranch(MI); - MachineBasicBlock::succ_iterator It = MBB->succ_begin(); - MachineBasicBlock::succ_iterator Next = It; - ++Next; - return (*It == TrueBranch) ? 
*Next : *It; -} - -bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { - switch (MI->getOpcode()) { - case AMDGPU::JUMP_COND: - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return true; - default: - return false; - } - return false; -} - -bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { - switch (MI->getOpcode()) { - case AMDGPU::JUMP: - case AMDGPU::BRANCH: - return true; - default: - return false; - } - return false; -} - -DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { - //get DebugLoc from the first MachineBasicBlock instruction with debug info - DebugLoc DL; - for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end(); - ++It) { - MachineInstr *instr = &(*It); - if (instr->getDebugLoc()) - DL = instr->getDebugLoc(); - } - return DL; -} - -MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( - MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - MachineInstr *MI = &*It; - if (MI && (isCondBranch(MI) || isUncondBranch(MI))) - return MI; - return nullptr; -} - -MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( - MachineBasicBlock *MBB) { - for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); - It != E; ++It) { - // FIXME: Simplify - MachineInstr *MI = &*It; - if (MI) { - if (isCondBranch(MI) || isUncondBranch(MI)) - return MI; - else if (!TII->isMov(MI->getOpcode())) - break; - } - } - return nullptr; -} - -MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - if (It != MBB->rend()) { - MachineInstr *instr = &(*It); - if (instr->getOpcode() == AMDGPU::RETURN) - return instr; - } - return nullptr; -} - -MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - if (It != MBB->rend()) { - MachineInstr *MI = &(*It); - if (MI->getOpcode() == AMDGPU::CONTINUE) - return MI; - } - return nullptr; -} - -bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { - MachineInstr *MI = getReturnInstr(MBB); - bool IsReturn = (MBB->succ_size() == 0); - if (MI) - assert(IsReturn); - else if (IsReturn) - DEBUG( - dbgs() << "BB" << MBB->getNumber() - <<" is return block without RETURN instr\n";); - return IsReturn; -} - -void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB) { - for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(), - iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It) - DstMBB->addSuccessor(*It); // *iter's predecessor is also taken care of -} - -MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { - MachineFunction *Func = MBB->getParent(); - MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); - Func->push_back(NewMBB); //insert to function - for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end(); - It != E; ++It) { - MachineInstr *MI = Func->CloneMachineInstr(It); - NewMBB->push_back(MI); - } - return NewMBB; -} - -void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( - MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, - MachineBasicBlock *NewBlk) { - MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); - if (BranchMI && isCondBranch(BranchMI) && - getTrueBranch(BranchMI) == OldMBB) - setTrueBranch(BranchMI, NewBlk); -} - -void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { - assert((!MBB->getParent()->getJumpTableInfo() - || 
MBB->getParent()->getJumpTableInfo()->isEmpty()) - && "found a jump table"); - - //collect continue right before endloop - SmallVector ContInstr; - MachineBasicBlock::iterator Pre = MBB->begin(); - MachineBasicBlock::iterator E = MBB->end(); - MachineBasicBlock::iterator It = Pre; - while (It != E) { - if (Pre->getOpcode() == AMDGPU::CONTINUE - && It->getOpcode() == AMDGPU::ENDLOOP) - ContInstr.push_back(Pre); - Pre = It; - ++It; - } - - //delete continue right before endloop - for (unsigned i = 0; i < ContInstr.size(); ++i) - ContInstr[i]->eraseFromParent(); - - // TODO to fix up jump table so later phase won't be confused. if - // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but - // there isn't such an interface yet. alternatively, replace all the other - // blocks in the jump table with the entryBlk //} - -} - - -bool AMDGPUCFGStructurizer::prepare() { - bool Changed = false; - - //FIXME: if not reducible flow graph, make it so ??? - - DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); - - orderBlocks(FuncRep); - - SmallVector RetBlks; - - // Add an ExitBlk to loop that don't have one - for (MachineLoopInfo::iterator It = MLI->begin(), - E = MLI->end(); It != E; ++It) { - MachineLoop *LoopRep = (*It); - MBBVector ExitingMBBs; - LoopRep->getExitingBlocks(ExitingMBBs); - - if (ExitingMBBs.size() == 0) { - MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep); - if (DummyExitBlk) - RetBlks.push_back(DummyExitBlk); - } - } - - // Remove unconditional branch instr. - // Add dummy exit block iff there are multiple returns. - for (SmallVectorImpl::const_iterator - It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) { - MachineBasicBlock *MBB = *It; - removeUnconditionalBranch(MBB); - removeRedundantConditionalBranch(MBB); - if (isReturnBlock(MBB)) { - RetBlks.push_back(MBB); - } - assert(MBB->succ_size() <= 2); - } - - if (RetBlks.size() >= 2) { - addDummyExitBlock(RetBlks); - Changed = true; - } - - return Changed; -} - -bool AMDGPUCFGStructurizer::run() { - - //Assume reducible CFG... - DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); - -#ifdef STRESSTEST - //Use the worse block ordering to test the algorithm. - ReverseVector(orderedBlks); -#endif - - DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); - int NumIter = 0; - bool Finish = false; - MachineBasicBlock *MBB; - bool MakeProgress = false; - int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(), - OrderedBlks.end()); - - do { - ++NumIter; - DEBUG( - dbgs() << "numIter = " << NumIter - << ", numRemaintedBlk = " << NumRemainedBlk << "\n"; - ); - - SmallVectorImpl::const_iterator It = - OrderedBlks.begin(); - SmallVectorImpl::const_iterator E = - OrderedBlks.end(); - - SmallVectorImpl::const_iterator SccBeginIter = - It; - MachineBasicBlock *SccBeginMBB = nullptr; - int SccNumBlk = 0; // The number of active blocks, init to a - // maximum possible number. - int SccNumIter; // Number of iteration in this SCC. - - while (It != E) { - MBB = *It; - - if (!SccBeginMBB) { - SccBeginIter = It; - SccBeginMBB = MBB; - SccNumIter = 0; - SccNumBlk = NumRemainedBlk; // Init to maximum possible number. - DEBUG( - dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); - dbgs() << "\n"; - ); - } - - if (!isRetiredBlock(MBB)) - patternMatch(MBB); - - ++It; - - bool ContNextScc = true; - if (It == E - || getSCCNum(SccBeginMBB) != getSCCNum(*It)) { - // Just finish one scc. 
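        // ---- Editor's aside: explanatory note, not part of the original file ----
        // The surrounding driver iterates each SCC to a fixed point: it re-runs
        // pattern matching on the same SCC while the active (non-retired) block
        // count keeps shrinking, and moves on once no further progress is made
        // or a single block remains.
        // ---- end editor's aside ----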
- ++SccNumIter; - int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It); - if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) { - DEBUG( - dbgs() << "Can't reduce SCC " << getSCCNum(MBB) - << ", sccNumIter = " << SccNumIter; - dbgs() << "doesn't make any progress\n"; - ); - ContNextScc = true; - } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { - SccNumBlk = sccRemainedNumBlk; - It = SccBeginIter; - ContNextScc = false; - DEBUG( - dbgs() << "repeat processing SCC" << getSCCNum(MBB) - << "sccNumIter = " << SccNumIter << '\n'; - ); - } else { - // Finish the current scc. - ContNextScc = true; - } - } else { - // Continue on next component in the current scc. - ContNextScc = false; - } - - if (ContNextScc) - SccBeginMBB = nullptr; - } //while, "one iteration" over the function. - - MachineBasicBlock *EntryMBB = - GraphTraits::nodes_begin(FuncRep); - if (EntryMBB->succ_size() == 0) { - Finish = true; - DEBUG( - dbgs() << "Reduce to one block\n"; - ); - } else { - int NewnumRemainedBlk - = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end()); - // consider cloned blocks ?? - if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) { - MakeProgress = true; - NumRemainedBlk = NewnumRemainedBlk; - } else { - MakeProgress = false; - DEBUG( - dbgs() << "No progress\n"; - ); - } - } - } while (!Finish && MakeProgress); - - // Misc wrap up to maintain the consistency of the Function representation. - wrapup(GraphTraits::nodes_begin(FuncRep)); - - // Detach retired Block, release memory. - for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); - It != E; ++It) { - if ((*It).second && (*It).second->IsRetired) { - assert(((*It).first)->getNumber() != -1); - DEBUG( - dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n"; - ); - (*It).first->eraseFromParent(); //Remove from the parent Function. 
- } - delete (*It).second; - } - BlockInfoMap.clear(); - LLInfoMap.clear(); - - if (!Finish) { - DEBUG(FuncRep->viewCFG()); - llvm_unreachable("IRREDUCIBLE_CFG"); - } - - return true; -} - - - -void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { - int SccNum = 0; - MachineBasicBlock *MBB; - for (scc_iterator It = scc_begin(MF); !It.isAtEnd(); - ++It, ++SccNum) { - const std::vector &SccNext = *It; - for (std::vector::const_iterator - blockIter = SccNext.begin(), blockEnd = SccNext.end(); - blockIter != blockEnd; ++blockIter) { - MBB = *blockIter; - OrderedBlks.push_back(MBB); - recordSccnum(MBB, SccNum); - } - } - - //walk through all the block in func to check for unreachable - typedef GraphTraits GTM; - MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF); - for (; It != E; ++It) { - MachineBasicBlock *MBB = &(*It); - SccNum = getSCCNum(MBB); - if (SccNum == INVALIDSCCNUM) - dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; - } -} - -int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { - int NumMatch = 0; - int CurMatch; - - DEBUG( - dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n"; - ); - - while ((CurMatch = patternMatchGroup(MBB)) > 0) - NumMatch += CurMatch; - - DEBUG( - dbgs() << "End patternMatch BB" << MBB->getNumber() - << ", numMatch = " << NumMatch << "\n"; - ); - - return NumMatch; -} - -int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { - int NumMatch = 0; - NumMatch += loopendPatternMatch(); - NumMatch += serialPatternMatch(MBB); - NumMatch += ifPatternMatch(MBB); - return NumMatch; -} - - -int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { - if (MBB->succ_size() != 1) - return 0; - - MachineBasicBlock *childBlk = *MBB->succ_begin(); - if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) - return 0; - - mergeSerialBlock(MBB, childBlk); - ++numSerialPatternMatch; - return 1; -} - -int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { - //two edges - if (MBB->succ_size() != 2) - return 0; - if (hasBackEdge(MBB)) - return 0; - MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); - if (!BranchMI) - return 0; - - assert(isCondBranch(BranchMI)); - int NumMatch = 0; - - MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI); - NumMatch += serialPatternMatch(TrueMBB); - NumMatch += ifPatternMatch(TrueMBB); - MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI); - NumMatch += serialPatternMatch(FalseMBB); - NumMatch += ifPatternMatch(FalseMBB); - MachineBasicBlock *LandBlk; - int Cloned = 0; - - assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty()); - // TODO: Simplify - if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1 - && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) { - // Diamond pattern - LandBlk = *TrueMBB->succ_begin(); - } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { - // Triangle pattern, false is empty - LandBlk = FalseMBB; - FalseMBB = nullptr; - } else if (FalseMBB->succ_size() == 1 - && *FalseMBB->succ_begin() == TrueMBB) { - // Triangle pattern, true is empty - // We reverse the predicate to make a triangle, empty false pattern; - std::swap(TrueMBB, FalseMBB); - reversePredicateSetter(MBB->end()); - LandBlk = FalseMBB; - FalseMBB = nullptr; - } else if (FalseMBB->succ_size() == 1 - && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { - LandBlk = *FalseMBB->succ_begin(); - } else if (TrueMBB->succ_size() == 1 - && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) { - LandBlk = 
*TrueMBB->succ_begin(); - } else { - return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB); - } - - // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the - // new BB created for landBlk==NULL may introduce new challenge to the - // reduction process. - if (LandBlk && - ((TrueMBB && TrueMBB->pred_size() > 1) - || (FalseMBB && FalseMBB->pred_size() > 1))) { - Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk); - } - - if (TrueMBB && TrueMBB->pred_size() > 1) { - TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB); - ++Cloned; - } - - if (FalseMBB && FalseMBB->pred_size() > 1) { - FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB); - ++Cloned; - } - - mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk); - - ++numIfPatternMatch; - - numClonedBlock += Cloned; - - return 1 + Cloned + NumMatch; -} - -int AMDGPUCFGStructurizer::loopendPatternMatch() { - std::deque NestedLoops; - for (auto &It: *MLI) - for (MachineLoop *ML : depth_first(It)) - NestedLoops.push_front(ML); - - if (NestedLoops.size() == 0) - return 0; - - // Process nested loop outside->inside (we did push_front), - // so "continue" to a outside loop won't be mistaken as "break" - // of the current loop. - int Num = 0; - for (MachineLoop *ExaminedLoop : NestedLoops) { - if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) - continue; - DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); - int NumBreak = mergeLoop(ExaminedLoop); - if (NumBreak == -1) - break; - Num += NumBreak; - } - return Num; -} - -int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { - MachineBasicBlock *LoopHeader = LoopRep->getHeader(); - MBBVector ExitingMBBs; - LoopRep->getExitingBlocks(ExitingMBBs); - assert(!ExitingMBBs.empty() && "Infinite Loop not supported"); - DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";); - // We assume a single ExitBlk - MBBVector ExitBlks; - LoopRep->getExitBlocks(ExitBlks); - SmallPtrSet ExitBlkSet; - for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i) - ExitBlkSet.insert(ExitBlks[i]); - assert(ExitBlkSet.size() == 1); - MachineBasicBlock *ExitBlk = *ExitBlks.begin(); - assert(ExitBlk && "Loop has several exit block"); - MBBVector LatchBlks; - typedef GraphTraits > InvMBBTraits; - InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader), - PE = InvMBBTraits::child_end(LoopHeader); - for (; PI != PE; PI++) { - if (LoopRep->contains(*PI)) - LatchBlks.push_back(*PI); - } - - for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i) - mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk); - for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i) - settleLoopcontBlock(LatchBlks[i], LoopHeader); - int Match = 0; - do { - Match = 0; - Match += serialPatternMatch(LoopHeader); - Match += ifPatternMatch(LoopHeader); - } while (Match > 0); - mergeLooplandBlock(LoopHeader, ExitBlk); - MachineLoop *ParentLoop = LoopRep->getParentLoop(); - if (ParentLoop) - MLI->changeLoopFor(LoopHeader, ParentLoop); - else - MLI->removeBlock(LoopHeader); - Visited[LoopRep] = true; - return 1; -} - -int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, - MachineBasicBlock *LoopHeader) { - int NumCont = 0; - SmallVector ContMBB; - typedef GraphTraits > GTIM; - GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader), - E = GTIM::child_end(LoopHeader); - for (; It != E; ++It) { - MachineBasicBlock *MBB = *It; - if (LoopRep->contains(MBB)) { - handleLoopcontBlock(MBB, MLI->getLoopFor(MBB), - LoopHeader, LoopRep); - 
ContMBB.push_back(MBB); - ++NumCont; - } - } - - for (SmallVectorImpl::iterator It = ContMBB.begin(), - E = ContMBB.end(); It != E; ++It) { - (*It)->removeSuccessor(LoopHeader); - } - - numLoopcontPatternMatch += NumCont; - - return NumCont; -} - - -bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( - MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { - if (Src1MBB->succ_size() == 0) { - MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB); - if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) { - MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep]; - if (TheEntry) { - DEBUG( - dbgs() << "isLoopContBreakBlock yes src1 = BB" - << Src1MBB->getNumber() - << " src2 = BB" << Src2MBB->getNumber() << "\n"; - ); - return true; - } - } - } - return false; -} - -int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { - int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); - if (Num == 0) { - DEBUG( - dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; - ); - Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); - } - return Num; -} - -int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { - int Num = 0; - MachineBasicBlock *DownBlk; - - //trueBlk could be the common post dominator - DownBlk = TrueMBB; - - DEBUG( - dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() - << " true = BB" << TrueMBB->getNumber() - << ", numSucc=" << TrueMBB->succ_size() - << " false = BB" << FalseMBB->getNumber() << "\n"; - ); - - while (DownBlk) { - DEBUG( - dbgs() << "check down = BB" << DownBlk->getNumber(); - ); - - if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { - DEBUG( - dbgs() << " working\n"; - ); - - Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); - Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); - - numClonedBlock += Num; - Num += serialPatternMatch(*HeadMBB->succ_begin()); - Num += serialPatternMatch(*std::next(HeadMBB->succ_begin())); - Num += ifPatternMatch(HeadMBB); - assert(Num > 0); - - break; - } - DEBUG( - dbgs() << " not working\n"; - ); - DownBlk = (DownBlk->succ_size() == 1) ? 
(*DownBlk->succ_begin()) : nullptr; - } // walk down the postDomTree - - return Num; -} - -void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( - MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, - MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { - dbgs() << "head = BB" << HeadMBB->getNumber() - << " size = " << HeadMBB->size(); - if (Detail) { - dbgs() << "\n"; - HeadMBB->print(dbgs()); - dbgs() << "\n"; - } - - if (TrueMBB) { - dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = " - << TrueMBB->size() << " numPred = " << TrueMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - TrueMBB->print(dbgs()); - dbgs() << "\n"; - } - } - if (FalseMBB) { - dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = " - << FalseMBB->size() << " numPred = " << FalseMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - FalseMBB->print(dbgs()); - dbgs() << "\n"; - } - } - if (LandMBB) { - dbgs() << ", land = BB" << LandMBB->getNumber() << " size = " - << LandMBB->size() << " numPred = " << LandMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - LandMBB->print(dbgs()); - dbgs() << "\n"; - } - } - - dbgs() << "\n"; -} - -int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, - MachineBasicBlock **LandMBBPtr) { - bool MigrateTrue = false; - bool MigrateFalse = false; - - MachineBasicBlock *LandBlk = *LandMBBPtr; - - assert((!TrueMBB || TrueMBB->succ_size() <= 1) - && (!FalseMBB || FalseMBB->succ_size() <= 1)); - - if (TrueMBB == FalseMBB) - return 0; - - MigrateTrue = needMigrateBlock(TrueMBB); - MigrateFalse = needMigrateBlock(FalseMBB); - - if (!MigrateTrue && !MigrateFalse) - return 0; - - // If we need to migrate either trueBlk and falseBlk, migrate the rest that - // have more than one predecessors. without doing this, its predecessor - // rather than headBlk will have undefined value in initReg. - if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1) - MigrateTrue = true; - if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1) - MigrateFalse = true; - - DEBUG( - dbgs() << "before improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); - ); - - // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk - // - // new: headBlk => if () {initReg = 1; org trueBlk branch} else - // {initReg = 0; org falseBlk branch } - // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} - // => org landBlk - // if landBlk->pred_size() > 2, put the about if-else inside - // if (initReg !=2) {...} - // - // add initReg = initVal to headBlk - - const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); - if (!MigrateTrue || !MigrateFalse) { - // XXX: We have an opportunity here to optimize the "branch into if" case - // here. Branch into if looks like this: - // entry - // / | - // diamond_head branch_from - // / \ | - // diamond_false diamond_true - // \ / - // done - // - // The diamond_head block begins the "if" and the diamond_true block - // is the block being "branched into". - // - // If MigrateTrue is true, then TrueBB is the block being "branched into" - // and if MigrateFalse is true, then FalseBB is the block being - // "branched into" - // - // Here is the pseudo code for how I think the optimization should work: - // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. - // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. - // 3. 
Move the branch instruction from diamond_head into its own basic - // block (new_block). - // 4. Add an unconditional branch from diamond_head to new_block - // 5. Replace the branch instruction in branch_from with an unconditional - // branch to new_block. If branch_from has multiple predecessors, then - // we need to replace the True/False block in the branch - // instruction instead of replacing it. - // 6. Change the condition of the branch instruction in new_block from - // COND to (COND || GPR0) - // - // In order insert these MOV instruction, we will need to use the - // RegisterScavenger. Usually liveness stops being tracked during - // the late machine optimization passes, however if we implement - // bool TargetRegisterInfo::requiresRegisterScavenging( - // const MachineFunction &MF) - // and have it return true, liveness will be tracked correctly - // by generic optimization passes. We will also need to make sure that - // all of our target-specific passes that run after regalloc and before - // the CFGStructurizer track liveness and we will need to modify this pass - // to correctly track liveness. - // - // After the above changes, the new CFG should look like this: - // entry - // / | - // diamond_head branch_from - // \ / - // new_block - // / | - // diamond_false diamond_true - // \ / - // done - // - // Without this optimization, we are forced to duplicate the diamond_true - // block and we will end up with a CFG like this: - // - // entry - // / | - // diamond_head branch_from - // / \ | - // diamond_false diamond_true diamond_true (duplicate) - // \ / | - // done --------------------| - // - // Duplicating diamond_true can be very costly especially if it has a - // lot of instructions. - return 0; - } - - int NumNewBlk = 0; - - bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); - - //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" - MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); - - if (LandBlkHasOtherPred) { - llvm_unreachable("Extra register needed to handle CFG"); - unsigned CmpResReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - llvm_unreachable("Extra compare instruction needed to handle CFG"); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, - CmpResReg, DebugLoc()); - } - - // XXX: We are running this after RA, so creating virtual registers will - // cause an assertion failure in the PostRA scheduling pass. - unsigned InitReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg, - DebugLoc()); - - if (MigrateTrue) { - migrateInstruction(TrueMBB, LandBlk, I); - // need to uncondionally insert the assignment to ensure a path from its - // predecessor rather than headBlk has valid value in initReg if - // (initVal != 1). 
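- // Reaching this point would require materializing an "initReg = 1"
- // assignment in the migrated block, which needs a fresh register. As the
- // XXX note above explains, this pass runs after register allocation, so
- // that in turn needs the RegisterScavenger; until that support exists,
- // the structurizer aborts on this CFG shape.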
- llvm_unreachable("Extra register needed to handle CFG"); - } - insertInstrBefore(I, AMDGPU::ELSE); - - if (MigrateFalse) { - migrateInstruction(FalseMBB, LandBlk, I); - // need to uncondionally insert the assignment to ensure a path from its - // predecessor rather than headBlk has valid value in initReg if - // (initVal != 0) - llvm_unreachable("Extra register needed to handle CFG"); - } - - if (LandBlkHasOtherPred) { - // add endif - insertInstrBefore(I, AMDGPU::ENDIF); - - // put initReg = 2 to other predecessors of landBlk - for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(), - PE = LandBlk->pred_end(); PI != PE; ++PI) { - MachineBasicBlock *MBB = *PI; - if (MBB != TrueMBB && MBB != FalseMBB) - llvm_unreachable("Extra register needed to handle CFG"); - } - } - DEBUG( - dbgs() << "result from improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); - ); - - // update landBlk - *LandMBBPtr = LandBlk; - - return NumNewBlk; -} - -void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, - MachineLoop *ContLoop) { - DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber() - << " header = BB" << ContMBB->getNumber() << "\n"; - dbgs() << "Trying to continue loop-depth = " - << getLoopDepth(ContLoop) - << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";); - settleLoopcontBlock(ContingMBB, ContMBB); -} - -void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB) { - DEBUG( - dbgs() << "serialPattern BB" << DstMBB->getNumber() - << " <= BB" << SrcMBB->getNumber() << "\n"; - ); - DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); - - DstMBB->removeSuccessor(SrcMBB); - cloneSuccessorList(DstMBB, SrcMBB); - - removeSuccessor(SrcMBB); - MLI->removeBlock(SrcMBB); - retireBlock(SrcMBB); -} - -void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, - MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, - MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { - assert (TrueMBB); - DEBUG( - dbgs() << "ifPattern BB" << MBB->getNumber(); - dbgs() << "{ "; - if (TrueMBB) { - dbgs() << "BB" << TrueMBB->getNumber(); - } - dbgs() << " } else "; - dbgs() << "{ "; - if (FalseMBB) { - dbgs() << "BB" << FalseMBB->getNumber(); - } - dbgs() << " }\n "; - dbgs() << "landBlock: "; - if (!LandMBB) { - dbgs() << "NULL"; - } else { - dbgs() << "BB" << LandMBB->getNumber(); - } - dbgs() << "\n"; - ); - - int OldOpcode = BranchMI->getOpcode(); - DebugLoc BranchDL = BranchMI->getDebugLoc(); - -// transform to -// if cond -// trueBlk -// else -// falseBlk -// endif -// landBlk - - MachineBasicBlock::iterator I = BranchMI; - insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode), - BranchDL); - - if (TrueMBB) { - MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); - MBB->removeSuccessor(TrueMBB); - if (LandMBB && TrueMBB->succ_size()!=0) - TrueMBB->removeSuccessor(LandMBB); - retireBlock(TrueMBB); - MLI->removeBlock(TrueMBB); - } - - if (FalseMBB) { - insertInstrBefore(I, AMDGPU::ELSE); - MBB->splice(I, FalseMBB, FalseMBB->begin(), - FalseMBB->end()); - MBB->removeSuccessor(FalseMBB); - if (LandMBB && FalseMBB->succ_size() != 0) - FalseMBB->removeSuccessor(LandMBB); - retireBlock(FalseMBB); - MLI->removeBlock(FalseMBB); - } - insertInstrBefore(I, AMDGPU::ENDIF); - - BranchMI->eraseFromParent(); - - if (LandMBB && TrueMBB && FalseMBB) - MBB->addSuccessor(LandMBB); - -} - -void 
AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
- MachineBasicBlock *LandMBB) {
- DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
- << " land = BB" << LandMBB->getNumber() << "\n";);
-
- insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
- insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
- DstBlk->addSuccessor(LandMBB);
- DstBlk->removeSuccessor(DstBlk);
-}
-
-
-void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
- MachineBasicBlock *LandMBB) {
- DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
- << " land = BB" << LandMBB->getNumber() << "\n";);
- MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
- assert(BranchMI && isCondBranch(BranchMI));
- DebugLoc DL = BranchMI->getDebugLoc();
- MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
- MachineBasicBlock::iterator I = BranchMI;
- if (TrueBranch != LandMBB)
- reversePredicateSetter(I);
- insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET,
- AMDGPU::PREDICATE_BIT, DL);
- insertInstrBefore(I, AMDGPU::BREAK);
- insertInstrBefore(I, AMDGPU::ENDIF);
- // Now the branch instruction can be erased safely.
- BranchMI->eraseFromParent();
- // Now take care of successors and retire blocks.
- ExitingMBB->removeSuccessor(LandMBB);
-}
-
-void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
- MachineBasicBlock *ContMBB) {
- DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
- << ContingMBB->getNumber()
- << ", cont = BB" << ContMBB->getNumber() << "\n";);
-
- MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
- if (MI) {
- assert(isCondBranch(MI));
- MachineBasicBlock::iterator I = MI;
- MachineBasicBlock *TrueBranch = getTrueBranch(MI);
- int OldOpcode = MI->getOpcode();
- DebugLoc DL = MI->getDebugLoc();
-
- bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);
-
- if (!UseContinueLogical) {
- int BranchOpcode =
- TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
- getBranchZeroOpcode(OldOpcode);
- insertCondBranchBefore(I, BranchOpcode, DL);
- // Use insertEnd to ensure phi-moves, if they exist, go before the
- // continue instruction.
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
- insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
- } else {
- int BranchOpcode =
- TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
- getContinueZeroOpcode(OldOpcode);
- insertCondBranchBefore(I, BranchOpcode, DL);
- }
-
- MI->eraseFromParent();
- } else {
- // If we've arrived here then we've already erased the branch instruction.
- // Travel back up the basic block to the last debug location we inserted;
- // that reference should be representative. Use insertEnd to ensure
- // phi-moves, if they exist, go before the continue instruction.
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
- getLastDebugLocInBB(ContingMBB));
- }
-}
-
-int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
- MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
- int Cloned = 0;
- assert(PreMBB->isSuccessor(SrcMBB));
- while (SrcMBB && SrcMBB != DstMBB) {
- assert(SrcMBB->succ_size() == 1);
- if (SrcMBB->pred_size() > 1) {
- SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
- ++Cloned;
- }
-
- PreMBB = SrcMBB;
- SrcMBB = *SrcMBB->succ_begin();
- }
-
- return Cloned;
-}
-
-MachineBasicBlock *
-AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
- MachineBasicBlock *PredMBB) {
- assert(PredMBB->isSuccessor(MBB) &&
- "succBlk is not a predecessor of curBlk");
-
- MachineBasicBlock *CloneMBB = clone(MBB); // Clone the instructions.
- replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
- // (srcBlk, oldBlk, newBlk)
-
- PredMBB->removeSuccessor(MBB);
- PredMBB->addSuccessor(CloneMBB);
-
- // Add all successors to cloneBlk.
- cloneSuccessorList(CloneMBB, MBB);
-
- numClonedInstr += MBB->size();
-
- DEBUG(
- dbgs() << "Cloned block: " << "BB"
- << MBB->getNumber() << ", size " << MBB->size() << "\n";
- );
-
- SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
-
- return CloneMBB;
-}
-
-void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
- MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
- MachineBasicBlock::iterator SpliceEnd;
- // Look for the input branch instruction, not the AMDGPU branch instruction.
- MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
- if (!BranchMI) {
- DEBUG(
- dbgs() << "migrateInstruction doesn't see a branch instr\n";
- );
- SpliceEnd = SrcMBB->end();
- } else {
- DEBUG(
- dbgs() << "migrateInstruction sees a branch instr\n";
- BranchMI->dump();
- );
- SpliceEnd = BranchMI;
- }
- DEBUG(
- dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
- << ", srcSize = " << SrcMBB->size() << "\n";
- );
-
- // The splice inserts before the insert position.
- DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
-
- DEBUG(
- dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
- << ", srcSize = " << SrcMBB->size() << "\n";
- );
-}
-
-MachineBasicBlock *
-AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop *LoopRep) {
- MachineBasicBlock *LoopHeader = LoopRep->getHeader();
- MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
- const TargetRegisterClass *I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-
- if (!LoopHeader || !LoopLatch)
- return nullptr;
- MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
- // Is LoopRep an infinite loop?
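- // A latch ending in an unconditional branch always transfers control back
- // to the header, so such a loop has no exit. The code below fabricates a
- // dummy exit block so the rest of the structurizer can treat it like a
- // normal loop.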
- if (!BranchMI || !isUncondBranch(BranchMI)) - return nullptr; - - MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(DummyExitBlk); //insert to function - SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); - DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); - MachineBasicBlock::iterator I = BranchMI; - unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC); - llvm_unreachable("Extra register needed to handle CFG"); - MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32); - MachineInstrBuilder MIB(*FuncRep, NewMI); - MIB.addMBB(LoopHeader); - MIB.addReg(ImmReg, false); - SHOWNEWINSTR(NewMI); - BranchMI->eraseFromParent(); - LoopLatch->addSuccessor(DummyExitBlk); - - return DummyExitBlk; -} - -void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { - MachineInstr *BranchMI; - - // I saw two unconditional branch in one basic block in example - // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. - while ((BranchMI = getLoopendBlockBranchInstr(MBB)) - && isUncondBranch(BranchMI)) { - DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); - BranchMI->eraseFromParent(); - } -} - -void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( - MachineBasicBlock *MBB) { - if (MBB->succ_size() != 2) - return; - MachineBasicBlock *MBB1 = *MBB->succ_begin(); - MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin()); - if (MBB1 != MBB2) - return; - - MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); - assert(BranchMI && isCondBranch(BranchMI)); - DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); - BranchMI->eraseFromParent(); - SHOWNEWBLK(MBB1, "Removing redundant successor"); - MBB->removeSuccessor(MBB1); -} - -void AMDGPUCFGStructurizer::addDummyExitBlock( - SmallVectorImpl &RetMBB) { - MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(DummyExitBlk); //insert to function - insertInstrEnd(DummyExitBlk, AMDGPU::RETURN); - - for (SmallVectorImpl::iterator It = RetMBB.begin(), - E = RetMBB.end(); It != E; ++It) { - MachineBasicBlock *MBB = *It; - MachineInstr *MI = getReturnInstr(MBB); - if (MI) - MI->eraseFromParent(); - MBB->addSuccessor(DummyExitBlk); - DEBUG( - dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() - << " successors\n"; - ); - } - SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); -} - -void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { - while (MBB->succ_size()) - MBB->removeSuccessor(*MBB->succ_begin()); -} - -void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, - int SccNum) { - BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; - if (!srcBlkInfo) - srcBlkInfo = new BlockInformation(); - srcBlkInfo->SccNum = SccNum; -} - -void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { - DEBUG( - dbgs() << "Retiring BB" << MBB->getNumber() << "\n"; - ); - - BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; - - if (!SrcBlkInfo) - SrcBlkInfo = new BlockInformation(); - - SrcBlkInfo->IsRetired = true; - assert(MBB->succ_size() == 0 && MBB->pred_size() == 0 - && "can't retire block yet"); -} - -void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep, - MachineBasicBlock *MBB) { - MachineBasicBlock *&TheEntry = LLInfoMap[loopRep]; - if (!MBB) { - MBB = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(MBB); //insert to function - SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: "); - } - TheEntry 
= MBB;
- DEBUG(
- dbgs() << "setLoopLandBlock loop-header = BB"
- << loopRep->getHeader()->getNumber()
- << " landing-block = BB" << MBB->getNumber() << "\n";
- );
-}
-
-MachineBasicBlock *
-AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
- MachineBasicBlock *MBB2) {
-
- if (PDT->dominates(MBB1, MBB2))
- return MBB1;
- if (PDT->dominates(MBB2, MBB1))
- return MBB2;
-
- MachineDomTreeNode *Node1 = PDT->getNode(MBB1);
- MachineDomTreeNode *Node2 = PDT->getNode(MBB2);
-
- // Handle newly cloned node.
- if (!Node1 && MBB1->succ_size() == 1)
- return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2);
- if (!Node2 && MBB2->succ_size() == 1)
- return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
-
- if (!Node1 || !Node2)
- return nullptr;
-
- Node1 = Node1->getIDom();
- while (Node1) {
- if (PDT->dominates(Node1, Node2))
- return Node1->getBlock();
- Node1 = Node1->getIDom();
- }
-
- return nullptr;
-}
-
-MachineBasicBlock *
-AMDGPUCFGStructurizer::findNearestCommonPostDom(
- std::set<MachineBasicBlock *> &MBBs) {
- MachineBasicBlock *CommonDom;
- std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin();
- std::set<MachineBasicBlock *>::const_iterator E = MBBs.end();
- for (CommonDom = *It; It != E && CommonDom; ++It) {
- MachineBasicBlock *MBB = *It;
- if (MBB != CommonDom)
- CommonDom = findNearestCommonPostDom(MBB, CommonDom);
- }
-
- DEBUG(
- dbgs() << "Common post dominator for exit blocks is ";
- if (CommonDom)
- dbgs() << "BB" << CommonDom->getNumber() << "\n";
- else
- dbgs() << "NULL\n";
- );
-
- return CommonDom;
-}
-
-char AMDGPUCFGStructurizer::ID = 0;
-
-} // end anonymous namespace
-
-
-INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
- "AMDGPU CFG Structurizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
- "AMDGPU CFG Structurizer", false, false)
-
-FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
- return new AMDGPUCFGStructurizer();
-}
Index: projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp
===================================================================
--- projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp (revision 288077)
+++ projects/clang370-import/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp (nonexistent)
@@ -1,600 +0,0 @@
-//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer -------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// The AMDGPUAsmPrinter is used to emit both assembly and binary code. When
-/// passed an MCAsmStreamer it prints assembly, and when passed an
-/// MCObjectStreamer it outputs binary code.
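-/// The same printer is registered for both the R600 and GCN targets; see
-/// LLVMInitializeR600AsmPrinter() below.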
-// -//===----------------------------------------------------------------------===// -// - -#include "AMDGPUAsmPrinter.h" -#include "InstPrinter/AMDGPUInstPrinter.h" -#include "AMDGPU.h" -#include "AMDKernelCodeT.h" -#include "AMDGPUSubtarget.h" -#include "R600Defines.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "SIDefines.h" -#include "SIMachineFunctionInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/ELF.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetLoweringObjectFile.h" - -using namespace llvm; - -// TODO: This should get the default rounding mode from the kernel. We just set -// the default here, but this could change if the OpenCL rounding mode pragmas -// are used. -// -// The denormal mode here should match what is reported by the OpenCL runtime -// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but -// can also be override to flush with the -cl-denorms-are-zero compiler flag. -// -// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double -// precision, and leaves single precision to flush all and does not report -// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports -// CL_FP_DENORM for both. -// -// FIXME: It seems some instructions do not support single precision denormals -// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, -// and sin_f32, cos_f32 on most parts). - -// We want to use these instructions, and using fp32 denormals also causes -// instructions to run at the double precision rate for the device so it's -// probably best to just report no single precision denormals. -static uint32_t getFPMode(const MachineFunction &F) { - const AMDGPUSubtarget& ST = F.getSubtarget(); - // TODO: Is there any real use for the flush in only / flush out only modes? - - uint32_t FP32Denormals = - ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; - - uint32_t FP64Denormals = - ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; - - return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | - FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | - FP_DENORM_MODE_SP(FP32Denormals) | - FP_DENORM_MODE_DP(FP64Denormals); -} - -static AsmPrinter * -createAMDGPUAsmPrinterPass(TargetMachine &tm, - std::unique_ptr &&Streamer) { - return new AMDGPUAsmPrinter(tm, std::move(Streamer)); -} - -extern "C" void LLVMInitializeR600AsmPrinter() { - TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); - TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass); -} - -AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, - std::unique_ptr Streamer) - : AsmPrinter(TM, std::move(Streamer)) {} - -void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - - // This label is used to mark the end of the .text section. - const TargetLoweringObjectFile &TLOF = getObjFileLowering(); - OutStreamer->SwitchSection(TLOF.getTextSection()); - MCSymbol *EndOfTextLabel = - OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - OutStreamer->EmitLabel(EndOfTextLabel); -} - -bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { - - // The starting address of all shader programs must be 256 bytes aligned. 
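- // MachineFunction alignment is specified as a power of two, so the value
- // 8 requests 1 << 8 == 256-byte alignment (compare the use of
- // MF.getAlignment() in EmitAmdKernelCodeT below).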
- MF.setAlignment(8); - - SetupMachineFunction(MF); - - MCContext &Context = getObjFileLowering().getContext(); - MCSectionELF *ConfigSection = - Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(ConfigSection); - - const AMDGPUSubtarget &STM = MF.getSubtarget(); - SIProgramInfo KernelInfo; - if (STM.isAmdHsaOS()) { - getSIProgramInfo(KernelInfo, MF); - EmitAmdKernelCodeT(MF, KernelInfo); - OutStreamer->EmitCodeAlignment(2 << (MF.getAlignment() - 1)); - } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - getSIProgramInfo(KernelInfo, MF); - EmitProgramInfoSI(MF, KernelInfo); - } else { - EmitProgramInfoR600(MF); - } - - DisasmLines.clear(); - HexLines.clear(); - DisasmLineMaxLen = 0; - - EmitFunctionBody(); - - if (isVerbose()) { - MCSectionELF *CommentSection = - Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(CommentSection); - - if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - OutStreamer->emitRawComment(" Kernel info:", false); - OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), - false); - OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), - false); - OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), - false); - OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), - false); - OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), - false); - OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), - false); - } else { - R600MachineFunctionInfo *MFI = MF.getInfo(); - OutStreamer->emitRawComment( - Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize))); - } - } - - if (STM.dumpCode()) { - - OutStreamer->SwitchSection( - Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); - - for (size_t i = 0; i < DisasmLines.size(); ++i) { - std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); - Comment += " ; " + HexLines[i] + "\n"; - - OutStreamer->EmitBytes(StringRef(DisasmLines[i])); - OutStreamer->EmitBytes(StringRef(Comment)); - } - } - - return false; -} - -void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { - unsigned MaxGPR = 0; - bool killPixel = false; - const AMDGPUSubtarget &STM = MF.getSubtarget(); - const R600RegisterInfo *RI = - static_cast(STM.getRegisterInfo()); - const R600MachineFunctionInfo *MFI = MF.getInfo(); - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (MI.getOpcode() == AMDGPU::KILLGT) - killPixel = true; - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - if (!MO.isReg()) - continue; - unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; - - // Register with value > 127 aren't GPR - if (HWReg > 127) - continue; - MaxGPR = std::max(MaxGPR, HWReg); - } - } - } - - unsigned RsrcReg; - if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { - // Evergreen / Northern Islands - switch (MFI->getShaderType()) { - default: // Fall through - case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; - case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; - case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; - case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; - } - } else { - // R600 / R700 - switch (MFI->getShaderType()) { - default: // Fall through - case 
ShaderType::GEOMETRY: // Fall through - case ShaderType::COMPUTE: // Fall through - case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; - case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; - } - } - - OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | - S_STACK_SIZE(MFI->StackSize), 4); - OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); - OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - - if (MFI->getShaderType() == ShaderType::COMPUTE) { - OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); - OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); - } -} - -void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, - const MachineFunction &MF) const { - const AMDGPUSubtarget &STM = MF.getSubtarget(); - const SIMachineFunctionInfo *MFI = MF.getInfo(); - uint64_t CodeSize = 0; - unsigned MaxSGPR = 0; - unsigned MaxVGPR = 0; - bool VCCUsed = false; - bool FlatUsed = false; - const SIRegisterInfo *RI = - static_cast(STM.getRegisterInfo()); - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - // TODO: CodeSize should account for multiple functions. - CodeSize += MI.getDesc().Size; - - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - unsigned width = 0; - bool isSGPR = false; - - if (!MO.isReg()) { - continue; - } - unsigned reg = MO.getReg(); - if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO || - reg == AMDGPU::VCC_HI) { - VCCUsed = true; - continue; - } else if (reg == AMDGPU::FLAT_SCR || - reg == AMDGPU::FLAT_SCR_LO || - reg == AMDGPU::FLAT_SCR_HI) { - FlatUsed = true; - continue; - } - - switch (reg) { - default: break; - case AMDGPU::SCC: - case AMDGPU::EXEC: - case AMDGPU::M0: - continue; - } - - if (AMDGPU::SReg_32RegClass.contains(reg)) { - isSGPR = true; - width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { - isSGPR = false; - width = 1; - } else if (AMDGPU::SReg_64RegClass.contains(reg)) { - isSGPR = true; - width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(reg)) { - isSGPR = false; - width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(reg)) { - isSGPR = false; - width = 3; - } else if (AMDGPU::SReg_128RegClass.contains(reg)) { - isSGPR = true; - width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(reg)) { - isSGPR = false; - width = 4; - } else if (AMDGPU::SReg_256RegClass.contains(reg)) { - isSGPR = true; - width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(reg)) { - isSGPR = false; - width = 8; - } else if (AMDGPU::SReg_512RegClass.contains(reg)) { - isSGPR = true; - width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(reg)) { - isSGPR = false; - width = 16; - } else { - llvm_unreachable("Unknown register class"); - } - unsigned hwReg = RI->getEncodingValue(reg) & 0xff; - unsigned maxUsed = hwReg + width - 1; - if (isSGPR) { - MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; - } else { - MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; - } - } - } - } - - if (VCCUsed) - MaxSGPR += 2; - - if (FlatUsed) - MaxSGPR += 2; - - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. 
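- // SI hardware allocates VGPRs in granules of four and SGPRs in granules
- // of eight; the VGPRBlocks/SGPRBlocks values computed below are the
- // granule counts that get packed into the PGM_RSRC1 fields.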
- ProgInfo.NumVGPR = MaxVGPR + 1; - ProgInfo.NumSGPR = MaxSGPR + 1; - - if (STM.hasSGPRInitBug()) { - if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) - llvm_unreachable("Too many SGPRs used with the SGPR init bug"); - - ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; - } - - ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; - ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; - // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode - // register. - ProgInfo.FloatMode = getFPMode(MF); - - // XXX: Not quite sure what this does, but sc seems to unset this. - ProgInfo.IEEEMode = 0; - - // Do not clamp NAN to 0. - ProgInfo.DX10Clamp = 0; - - const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); - - ProgInfo.FlatUsed = FlatUsed; - ProgInfo.VCCUsed = VCCUsed; - ProgInfo.CodeLen = CodeSize; - - unsigned LDSAlignShift; - if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - // LDS is allocated in 64 dword blocks. - LDSAlignShift = 8; - } else { - // LDS is allocated in 128 dword blocks. - LDSAlignShift = 9; - } - - unsigned LDSSpillSize = MFI->LDSWaveSpillSize * - MFI->getMaximumWorkGroupSize(MF); - - ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; - ProgInfo.LDSBlocks = - RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; - - // Scratch is allocated in 256 dword blocks. - unsigned ScratchAlignShift = 10; - // We need to program the hardware with the amount of scratch memory that - // is used by the entire wave. ProgInfo.ScratchSize is the amount of - // scratch memory used per thread. - ProgInfo.ScratchBlocks = - RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), - 1 << ScratchAlignShift) >> ScratchAlignShift; - - ProgInfo.ComputePGMRSrc1 = - S_00B848_VGPRS(ProgInfo.VGPRBlocks) | - S_00B848_SGPRS(ProgInfo.SGPRBlocks) | - S_00B848_PRIORITY(ProgInfo.Priority) | - S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | - S_00B848_PRIV(ProgInfo.Priv) | - S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | - S_00B848_IEEE_MODE(ProgInfo.DebugMode) | - S_00B848_IEEE_MODE(ProgInfo.IEEEMode); - - ProgInfo.ComputePGMRSrc2 = - S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | - S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | - S_00B84C_TGID_X_EN(1) | - S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); -} - -static unsigned getRsrcReg(unsigned ShaderType) { - switch (ShaderType) { - default: // Fall through - case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; - case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; - case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; - case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; - } -} - -void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) { - const AMDGPUSubtarget &STM = MF.getSubtarget(); - const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); - - if (MFI->getShaderType() == ShaderType::COMPUTE) { - OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); - - OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); - - OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); - - OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); - - // TODO: 
Should probably note flat usage somewhere. SC emits a "FlatPtr32 = - // 0" comment but I don't see a corresponding field in the register spec. - } else { - OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | - S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); - if (STM.isVGPRSpillingEnabled(MFI)) { - OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); - } - } - - if (MFI->getShaderType() == ShaderType::PIXEL) { - OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); - OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->PSInputAddr, 4); - } -} - -void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const { - const SIMachineFunctionInfo *MFI = MF.getInfo(); - const AMDGPUSubtarget &STM = MF.getSubtarget(); - amd_kernel_code_t header; - - memset(&header, 0, sizeof(header)); - - header.amd_code_version_major = AMD_CODE_VERSION_MAJOR; - header.amd_code_version_minor = AMD_CODE_VERSION_MINOR; - - header.struct_byte_size = sizeof(amd_kernel_code_t); - - header.target_chip = STM.getAmdKernelCodeChipID(); - - header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment()); - - header.compute_pgm_resource_registers = - KernelInfo.ComputePGMRSrc1 | - (KernelInfo.ComputePGMRSrc2 << 32); - - // Code Properties: - header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | - AMD_CODE_PROPERTY_IS_PTR64; - - if (KernelInfo.FlatUsed) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; - - if (KernelInfo.ScratchBlocks) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; - - header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; - header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; - - // MFI->ABIArgOffset is the number of bytes for the kernel arguments - // plus 36. 36 is the number of bytes reserved at the begining of the - // input buffer to store work-group size information. - // FIXME: We should be adding the size of the implicit arguments - // to this value. - header.kernarg_segment_byte_size = MFI->ABIArgOffset; - - header.wavefront_sgpr_count = KernelInfo.NumSGPR; - header.workitem_vgpr_count = KernelInfo.NumVGPR; - - // FIXME: What values do I put for these alignments - header.kernarg_segment_alignment = 0; - header.group_segment_alignment = 0; - header.private_segment_alignment = 0; - - header.code_type = 1; // HSA_EXT_CODE_KERNEL - - header.wavefront_size = STM.getWavefrontSize(); - - MCSectionELF *VersionSection = - OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(VersionSection); - OutStreamer->EmitBytes(Twine("HSA Code Unit:" + - Twine(header.hsail_version_major) + "." + - Twine(header.hsail_version_minor) + ":" + - "AMD:" + - Twine(header.amd_code_version_major) + "." 
+ - Twine(header.amd_code_version_minor) + ":" + - "GFX8.1:0").str()); - - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - - if (isVerbose()) { - OutStreamer->emitRawComment("amd_code_version_major = " + - Twine(header.amd_code_version_major), false); - OutStreamer->emitRawComment("amd_code_version_minor = " + - Twine(header.amd_code_version_minor), false); - OutStreamer->emitRawComment("struct_byte_size = " + - Twine(header.struct_byte_size), false); - OutStreamer->emitRawComment("target_chip = " + - Twine(header.target_chip), false); - OutStreamer->emitRawComment(" compute_pgm_rsrc1: " + - Twine::utohexstr(KernelInfo.ComputePGMRSrc1), - false); - OutStreamer->emitRawComment(" compute_pgm_rsrc2: " + - Twine::utohexstr(KernelInfo.ComputePGMRSrc2), - false); - OutStreamer->emitRawComment("enable_sgpr_private_segment_buffer = " + - Twine((bool)(header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false); - OutStreamer->emitRawComment("enable_sgpr_kernarg_segment_ptr = " + - Twine((bool)(header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false); - OutStreamer->emitRawComment("private_element_size = 2 ", false); - OutStreamer->emitRawComment("is_ptr64 = " + - Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false); - OutStreamer->emitRawComment("workitem_private_segment_byte_size = " + - Twine(header.workitem_private_segment_byte_size), - false); - OutStreamer->emitRawComment("workgroup_group_segment_byte_size = " + - Twine(header.workgroup_group_segment_byte_size), - false); - OutStreamer->emitRawComment("gds_segment_byte_size = " + - Twine(header.gds_segment_byte_size), false); - OutStreamer->emitRawComment("kernarg_segment_byte_size = " + - Twine(header.kernarg_segment_byte_size), false); - OutStreamer->emitRawComment("wavefront_sgpr_count = " + - Twine(header.wavefront_sgpr_count), false); - OutStreamer->emitRawComment("workitem_vgpr_count = " + - Twine(header.workitem_vgpr_count), false); - OutStreamer->emitRawComment("code_type = " + Twine(header.code_type), false); - OutStreamer->emitRawComment("wavefront_size = " + - Twine((int)header.wavefront_size), false); - OutStreamer->emitRawComment("optimization_level = " + - Twine(header.optimization_level), false); - OutStreamer->emitRawComment("hsail_profile = " + - Twine(header.hsail_profile), false); - OutStreamer->emitRawComment("hsail_machine_model = " + - Twine(header.hsail_machine_model), false); - OutStreamer->emitRawComment("hsail_version_major = " + - Twine(header.hsail_version_major), false); - OutStreamer->emitRawComment("hsail_version_minor = " + - Twine(header.hsail_version_minor), false); - } - - OutStreamer->EmitBytes(StringRef((char*)&header, sizeof(header))); -} - -bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, - const char *ExtraCode, raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) - return true; // Unknown modifier. - - switch (ExtraCode[0]) { - default: - // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); - case 'r': - break; - } - } - - AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O, - *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); - return false; -}
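
The PGM_RSRC1 packing performed by getSIProgramInfo and emitted in EmitProgramInfoSI above can be illustrated in isolation. The following is a minimal standalone sketch, not the driver's real definitions: the bit positions (VGPRS in bits [5:0], SGPRS in bits [9:6]) are an assumption based on publicly documented SI register layouts, and vgprBlocks/sgprBlocks/packRsrc1 are hypothetical helpers standing in for the S_00B848_VGPRS/S_00B848_SGPRS macros from SIDefines.h.

#include <cstdint>
#include <cstdio>

// Register counts are encoded as allocation granules, minus one:
// VGPRs are allocated in groups of 4, SGPRs in groups of 8.
static uint32_t vgprBlocks(uint32_t NumVGPR) { return (NumVGPR - 1) / 4; }
static uint32_t sgprBlocks(uint32_t NumSGPR) { return (NumSGPR - 1) / 8; }

// Assumed field layout: VGPRS in bits [5:0], SGPRS in bits [9:6].
static uint32_t packRsrc1(uint32_t NumVGPR, uint32_t NumSGPR) {
  return (vgprBlocks(NumVGPR) & 0x3F) | ((sgprBlocks(NumSGPR) & 0xF) << 6);
}

int main() {
  // A kernel using 41 VGPRs and 22 SGPRs (VCC/FLAT_SCR already folded in):
  // vgprBlocks = 10, sgprBlocks = 2, so the packed value is 0x8A.
  std::printf("RSRC1 = 0x%X\n", (unsigned)packRsrc1(41, 22));
  return 0;
}

Encoding granule counts rather than raw register counts is why getSIProgramInfo computes "(N - 1) / granule": the hardware rounds each wave's register allocation up to the granule size, so the minus-one/divide pair yields the smallest block count that covers the registers actually used.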