Page MenuHomeFreeBSD

No OneTemporary

This file is larger than 256 KB, so syntax highlighting was skipped.
Index: projects/clang400-import/contrib/compiler-rt
===================================================================
--- projects/clang400-import/contrib/compiler-rt (revision 314268)
+++ projects/clang400-import/contrib/compiler-rt (revision 314269)
Property changes on: projects/clang400-import/contrib/compiler-rt
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/compiler-rt/dist:r314177-314268
Index: projects/clang400-import/contrib/libc++
===================================================================
--- projects/clang400-import/contrib/libc++ (revision 314268)
+++ projects/clang400-import/contrib/libc++ (revision 314269)
Property changes on: projects/clang400-import/contrib/libc++
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/libc++/dist:r314177-314268
Index: projects/clang400-import/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- projects/clang400-import/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h (revision 314268)
+++ projects/clang400-import/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h (revision 314269)
@@ -1,118 +1,112 @@
//===---- SLPVectorizer.h ---------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H
#define LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H
#include "llvm/ADT/MapVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
namespace llvm {
/// A private "module" namespace for types and utilities used by this pass.
/// These are implementation details and should not be used by clients.
namespace slpvectorizer {
class BoUpSLP;
}
struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
typedef SmallVector<StoreInst *, 8> StoreList;
typedef MapVector<Value *, StoreList> StoreListMap;
typedef SmallVector<WeakVH, 8> WeakVHList;
typedef MapVector<Value *, WeakVHList> WeakVHListMap;
ScalarEvolution *SE = nullptr;
TargetTransformInfo *TTI = nullptr;
TargetLibraryInfo *TLI = nullptr;
AliasAnalysis *AA = nullptr;
LoopInfo *LI = nullptr;
DominatorTree *DT = nullptr;
AssumptionCache *AC = nullptr;
DemandedBits *DB = nullptr;
const DataLayout *DL = nullptr;
public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
// Glue for old PM.
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_,
DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_);
private:
/// \brief Collect store and getelementptr instructions and organize them
/// according to the underlying object of their pointer operands. We sort the
/// instructions by their underlying objects to reduce the cost of
/// consecutive access queries.
///
/// TODO: We can further reduce this cost if we flush the chain creation
/// every time we run into a memory barrier.
void collectSeedInstructions(BasicBlock *BB);
/// \brief Try to vectorize a chain that starts at two arithmetic instrs.
bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
/// \brief Try to vectorize a list of operands.
/// \@param BuildVector A list of users to ignore for the purpose of
/// scheduling and that don't need extracting.
/// \returns true if a value was vectorized.
bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
ArrayRef<Value *> BuildVector = None,
bool AllowReorder = false);
/// \brief Try to vectorize a chain that may start at the operands of \V;
bool tryToVectorize(BinaryOperator *V, slpvectorizer::BoUpSLP &R);
/// \brief Vectorize the store instructions collected in Stores.
bool vectorizeStoreChains(slpvectorizer::BoUpSLP &R);
/// \brief Vectorize the index computations of the getelementptr instructions
/// collected in GEPs.
bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
- /// Try to find horizontal reduction or otherwise vectorize a chain of binary
- /// operators.
- bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
- slpvectorizer::BoUpSLP &R,
- TargetTransformInfo *TTI);
-
/// \brief Scan the basic block and look for patterns that are likely to start
/// a vectorization chain.
bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
unsigned VecRegSize);
bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);
/// The store instructions in a basic block organized by base pointer.
StoreListMap Stores;
/// The getelementptr instructions in a basic block organized by base pointer.
WeakVHListMap GEPs;
};
}
#endif // LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H
Index: projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td (revision 314268)
+++ projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td (revision 314269)
@@ -1,1094 +1,1099 @@
//===-- SIInstructions.td - SI Instruction Defintions ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This file was originally auto-generated from a GPU register header file and
// all the instruction definitions were originally commented out. Instructions
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//
def isGCN : Predicate<"Subtarget->getGeneration() "
">= SISubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureGCN">;
def isSI : Predicate<"Subtarget->getGeneration() "
"== SISubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureSouthernIslands">;
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
AssemblerPredicate<"FeatureVGPRIndexMode">;
def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
AssemblerPredicate<"FeatureMovrel">;
include "VOPInstructions.td"
include "SOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
let SubtargetPredicate = isGCN in {
//===----------------------------------------------------------------------===//
// EXP Instructions
//===----------------------------------------------------------------------===//
defm EXP : EXP_m<0, AMDGPUexport>;
defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
//===----------------------------------------------------------------------===//
// VINTRP Instructions
//===----------------------------------------------------------------------===//
let Uses = [M0, EXEC] in {
// FIXME: Specify SchedRW for VINTRP insturctions.
multiclass V_INTERP_P1_F32_m : VINTRP_m <
0x00000000,
(outs VGPR_32:$vdst),
(ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan),
(i32 imm:$attr)))]
>;
let OtherPredicates = [has32BankLDS] in {
defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
} // End OtherPredicates = [has32BankLDS]
let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
defm V_INTERP_P2_F32 : VINTRP_m <
0x00000001,
(outs VGPR_32:$vdst),
(ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
(i32 imm:$attr)))]>;
} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
defm V_INTERP_MOV_F32 : VINTRP_m <
0x00000002,
(outs VGPR_32:$vdst),
(ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
(i32 imm:$attr)))]>;
} // End Uses = [M0, EXEC]
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// For use in patterns
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let usesCustomInserter = 1;
}
// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
// pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End let usesCustomInserter = 1, SALU = 1
def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),
(ins SSrc_b64:$src0)> {
let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
}
def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
(ins SSrc_b64:$src0, SSrc_b64:$src1)> {
let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
}
def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
(ins SSrc_b64:$src0, SSrc_b64:$src1)> {
let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
}
def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
[(int_amdgcn_wave_barrier)]> {
let SchedRW = [];
let hasNoSchedulingInfo = 1;
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
let isBarrier = 1;
let isConvergent = 1;
}
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
// Dummy terminator instruction to use after control flow instructions
// replaced with exec mask operations.
def SI_MASK_BRANCH : PseudoInstSI <
(outs), (ins brtarget:$target)> {
let isBranch = 0;
let isTerminator = 1;
let isBarrier = 0;
let Uses = [EXEC];
let SchedRW = [];
let hasNoSchedulingInfo = 1;
}
let isTerminator = 1 in {
def SI_IF: CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
[(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
let Constraints = "";
let Size = 12;
let mayLoad = 1;
let mayStore = 1;
let hasSideEffects = 1;
}
def SI_ELSE : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
let Constraints = "$src = $dst";
let Size = 12;
let mayStore = 1;
let mayLoad = 1;
let hasSideEffects = 1;
}
def SI_LOOP : CFPseudoInstSI <
(outs), (ins SReg_64:$saved, brtarget:$target),
[(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
let Size = 8;
let isBranch = 1;
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
}
} // End isBranch = 1, isTerminator = 1
def SI_END_CF : CFPseudoInstSI <
(outs), (ins SReg_64:$saved),
[(int_amdgcn_end_cf i64:$saved)], 1, 1> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
let mayLoad = 1;
let mayStore = 1;
let hasSideEffects = 1;
}
def SI_BREAK : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$src),
[(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
}
def SI_IF_BREAK : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
[(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
}
def SI_ELSE_BREAK : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
[(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
}
let Uses = [EXEC], Defs = [EXEC,VCC] in {
def SI_KILL : PseudoInstSI <
(outs), (ins VSrc_b32:$src),
[(AMDGPUkill i32:$src)]> {
let isConvergent = 1;
let usesCustomInserter = 1;
}
def SI_KILL_TERMINATOR : SPseudoInstSI <
(outs), (ins VSrc_b32:$src)> {
let isTerminator = 1;
}
} // End Uses = [EXEC], Defs = [EXEC,VCC]
// Branch on undef scc. Used to avoid intermediate copy from
// IMPLICIT_DEF to SCC.
def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
let isTerminator = 1;
let usesCustomInserter = 1;
}
def SI_PS_LIVE : PseudoInstSI <
(outs SReg_64:$dst), (ins),
[(set i1:$dst, (int_amdgcn_ps_live))]> {
let SALU = 1;
}
// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
// fold operands before it runs.
def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
let Defs = [M0];
let usesCustomInserter = 1;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
}
def SI_RETURN : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn)]> {
let isTerminator = 1;
let isBarrier = 1;
let isReturn = 1;
let hasSideEffects = 1;
let hasNoSchedulingInfo = 1;
let DisableWQM = 1;
}
let Defs = [M0, EXEC],
UseNamedOperandTable = 1 in {
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
(outs VGPR_32:$vdst),
(ins rc:$src, VS_32:$idx, i32imm:$offset)> {
let usesCustomInserter = 1;
}
class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
(outs rc:$vdst),
(ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
let Constraints = "$src = $vdst";
let usesCustomInserter = 1;
}
// TODO: We can support indirect SGPR access.
def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
} // End Uses = [EXEC], Defs = [M0, EXEC]
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
def _SAVE : PseudoInstSI <
(outs),
(ins sgpr_class:$data, i32imm:$addr)> {
let mayStore = 1;
let mayLoad = 0;
}
def _RESTORE : PseudoInstSI <
(outs sgpr_class:$data),
(ins i32imm:$addr)> {
let mayStore = 0;
let mayLoad = 1;
}
} // End UseNamedOperandTable = 1
}
// You cannot use M0 as the output of v_readlane_b32 instructions or
// use it in the sdata operand of SMEM instructions. We still need to
// be able to spill the physical register m0, so allow it for
// SI_SPILL_32_* instructions.
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let UseNamedOperandTable = 1, VGPRSpill = 1,
SchedRW = [WriteVMEM] in {
def _SAVE : VPseudoInstSI <
(outs),
(ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
SReg_32:$soffset, i32imm:$offset)> {
let mayStore = 1;
let mayLoad = 0;
// (2 * 4) + (8 * num_subregs) bytes maximum
let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
}
def _RESTORE : VPseudoInstSI <
(outs vgpr_class:$vdata),
(ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
i32imm:$offset)> {
let mayStore = 0;
let mayLoad = 1;
// (2 * 4) + (8 * num_subregs) bytes maximum
let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
}
} // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}
defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
(ins si_ga:$ptr_lo, si_ga:$ptr_hi),
[(set SReg_64:$dst,
(i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> {
let Defs = [SCC];
}
} // End SubtargetPredicate = isGCN
let Predicates = [isGCN] in {
def : Pat<
(int_amdgcn_else i64:$src, bb:$target),
(SI_ELSE $src, $target, 0)
>;
def : Pat <
(int_AMDGPU_kilp),
(SI_KILL (i32 0xbf800000))
>;
//===----------------------------------------------------------------------===//
// VOP1 Patterns
//===----------------------------------------------------------------------===//
let Predicates = [UnsafeFPMath] in {
//def : RcpPat<V_RCP_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F64_e32, f64>;
// Convert (x - floor(x)) to fract(x)
def : Pat <
(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
(f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
(V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
// Convert (x + (-floor(x))) to fract(x)
def : Pat <
(f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
(f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
} // End Predicates = [UnsafeFPMath]
def : Pat <
(f32 (fpextend f16:$src)),
(V_CVT_F32_F16_e32 $src)
>;
def : Pat <
(f64 (fpextend f16:$src)),
(V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
def : Pat <
(f16 (fpround f32:$src)),
(V_CVT_F16_F32_e32 $src)
>;
def : Pat <
(f16 (fpround f64:$src)),
(V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src))
>;
def : Pat <
(i32 (fp_to_sint f16:$src)),
(V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
def : Pat <
(i32 (fp_to_uint f16:$src)),
(V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
def : Pat <
(f16 (sint_to_fp i32:$src)),
(V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src))
>;
def : Pat <
(f16 (uint_to_fp i32:$src)),
(V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src))
>;
//===----------------------------------------------------------------------===//
// VOP2 Patterns
//===----------------------------------------------------------------------===//
multiclass FMADPat <ValueType vt, Instruction inst> {
def : Pat <
(vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
(VOP3NoMods vt:$src1, i32:$src1_modifiers),
(VOP3NoMods vt:$src2, i32:$src2_modifiers))),
(inst $src0_modifiers, $src0, $src1_modifiers, $src1,
$src2_modifiers, $src2, $clamp, $omod)
>;
}
defm : FMADPat <f16, V_MAC_F16_e64>;
defm : FMADPat <f32, V_MAC_F32_e64>;
multiclass SelectPat <ValueType vt, Instruction inst> {
def : Pat <
(vt (select i1:$src0, vt:$src1, vt:$src2)),
(inst $src2, $src1, $src0)
>;
}
defm : SelectPat <i16, V_CNDMASK_B32_e64>;
defm : SelectPat <i32, V_CNDMASK_B32_e64>;
defm : SelectPat <f16, V_CNDMASK_B32_e64>;
defm : SelectPat <f32, V_CNDMASK_B32_e64>;
def : Pat <
(i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
foreach Index = 0-2 in {
def Extract_Element_v2i32_#Index : Extract_Element <
i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v2i32_#Index : Insert_Element <
i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v2f32_#Index : Extract_Element <
f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v2f32_#Index : Insert_Element <
f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
foreach Index = 0-3 in {
def Extract_Element_v4i32_#Index : Extract_Element <
i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v4i32_#Index : Insert_Element <
i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v4f32_#Index : Extract_Element <
f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v4f32_#Index : Insert_Element <
f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
foreach Index = 0-7 in {
def Extract_Element_v8i32_#Index : Extract_Element <
i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v8i32_#Index : Insert_Element <
i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v8f32_#Index : Extract_Element <
f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v8f32_#Index : Insert_Element <
f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
foreach Index = 0-15 in {
def Extract_Element_v16i32_#Index : Extract_Element <
i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v16i32_#Index : Insert_Element <
i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v16f32_#Index : Extract_Element <
f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v16f32_#Index : Insert_Element <
f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
// FIXME: Why do only some of these type combinations for SReg and
// VReg?
// 16-bit bitcast
def : BitConvert <i16, f16, VGPR_32>;
def : BitConvert <f16, i16, VGPR_32>;
def : BitConvert <i16, f16, SReg_32>;
def : BitConvert <f16, i16, SReg_32>;
// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <f32, i32, SReg_32>;
// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
def : BitConvert <f64, i64, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2f32, VReg_64>;
def : BitConvert <v2f32, i64, VReg_64>;
def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;
// 128-bit bitcast
def : BitConvert <v2i64, v4i32, SReg_128>;
def : BitConvert <v4i32, v2i64, SReg_128>;
def : BitConvert <v2f64, v4f32, VReg_128>;
def : BitConvert <v2f64, v4i32, VReg_128>;
def : BitConvert <v4f32, v2f64, VReg_128>;
def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, VReg_256>;
def : BitConvert <v8f32, v8i32, VReg_256>;
// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
/********** =================== **********/
/********** Src & Dst modifiers **********/
/********** =================== **********/
def : Pat <
(AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
(f32 FP_ZERO), (f32 FP_ONE)),
(V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
>;
/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/
// Prevent expanding both fneg and fabs.
def : Pat <
(fneg (fabs f32:$src)),
(S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit
>;
// FIXME: Should use S_OR_B32
def : Pat <
(fneg (fabs f64:$src)),
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
(V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
(V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
sub1)
>;
def : Pat <
(fabs f32:$src),
(V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
>;
def : Pat <
(fneg f32:$src),
(V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000)))
>;
def : Pat <
(fabs f64:$src),
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
(V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
(V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit.
sub1)
>;
def : Pat <
(fneg f64:$src),
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
(V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
(i32 (V_MOV_B32_e32 (i32 0x80000000)))),
sub1)
>;
def : Pat <
(fneg f16:$src),
(V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
>;
def : Pat <
(fabs f16:$src),
(V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff)))
>;
def : Pat <
(fneg (fabs f16:$src)),
(S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
>;
/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/
def : Pat <
(VGPRImm<(i32 imm)>:$imm),
(V_MOV_B32_e32 imm:$imm)
>;
def : Pat <
(VGPRImm<(f32 fpimm)>:$imm),
(V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
(i32 imm:$imm),
(S_MOV_B32 imm:$imm)
>;
// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding. Should
// use s_mov_b32, which can be shrunk to s_movk_i32
def : Pat <
(VGPRImm<(f16 fpimm)>:$imm),
(V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
(f32 fpimm:$imm),
(S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
(f16 fpimm:$imm),
(S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
(i32 frameindex:$fi),
(V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
>;
def : Pat <
(i64 InlineImm<i64>:$imm),
(S_MOV_B64 InlineImm<i64>:$imm)
>;
// XXX - Should this use a s_cmp to set SCC?
// Set to sign-extended 64-bit value (true = -1, false = 0)
def : Pat <
(i1 imm:$imm),
(S_MOV_B64 (i64 (as_i64imm $imm)))
>;
def : Pat <
(f64 InlineFPImm<f64>:$imm),
(S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm)))
>;
/********** ================== **********/
/********** Intrinsic Patterns **********/
/********** ================== **********/
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
def : Pat <
(int_AMDGPU_cube v4f32:$src),
(REG_SEQUENCE VReg_128,
(V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub0,
(V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub1,
(V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub2,
(V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub3)
>;
def : Pat <
(i32 (sext i1:$src0)),
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
>;
class Ext32Pat <SDNode ext> : Pat <
(i32 (ext i1:$src0)),
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
>;
def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
// The multiplication scales from [0,1] to the unsigned integer range
def : Pat <
(AMDGPUurecip i32:$src0),
(V_CVT_U32_F32_e32
(V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
//===----------------------------------------------------------------------===//
// VOP3 Patterns
//===----------------------------------------------------------------------===//
def : IMad24Pat<V_MAD_I32_I24>;
def : UMad24Pat<V_MAD_U32_U24>;
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
/********** ====================== **********/
/********** Indirect addressing **********/
/********** ====================== **********/
multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
// Extract with offset
def : Pat<
(eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
(!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
>;
// Insert with offset
def : Pat<
(insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
(!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
>;
}
defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
//===----------------------------------------------------------------------===//
// SAD Patterns
//===----------------------------------------------------------------------===//
def : Pat <
(add (sub_oneuse (umax i32:$src0, i32:$src1),
(umin i32:$src0, i32:$src1)),
i32:$src2),
(V_SAD_U32 $src0, $src1, $src2)
>;
def : Pat <
(add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)),
(sub i32:$src0, i32:$src1),
(sub i32:$src1, i32:$src0)),
i32:$src2),
(V_SAD_U32 $src0, $src1, $src2)
>;
//===----------------------------------------------------------------------===//
// Conversion Patterns
//===----------------------------------------------------------------------===//
def : Pat<(i32 (sext_inreg i32:$src, i1)),
(S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
// Handle sext_inreg in i64
def : Pat <
(i64 (sext_inreg i64:$src, i1)),
(S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
>;
def : Pat <
(i16 (sext_inreg i16:$src, i1)),
(S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
>;
def : Pat <
(i16 (sext_inreg i16:$src, i8)),
(S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i8)),
(S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i16)),
(S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i32)),
(S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
>;
def : Pat <
(i64 (zext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
>;
def : Pat <
(i64 (anyext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
>;
class ZExt_i64_i1_Pat <SDNode ext> : Pat <
(i64 (ext i1:$src)),
(REG_SEQUENCE VReg_64,
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
(S_MOV_B32 (i32 0)), sub1)
>;
def : ZExt_i64_i1_Pat<zext>;
def : ZExt_i64_i1_Pat<anyext>;
// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
// REG_SEQUENCE patterns don't support instructions with multiple outputs.
def : Pat <
(i64 (sext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;
def : Pat <
(i64 (sext i1:$src)),
(REG_SEQUENCE VReg_64,
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
>;
class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
(i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
(i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
// comparisons still write to a pair of SGPRs, so treat these as
// 64-bit comparisons. When legalizing SGPR copies, instructions
// resulting in the copies from SCC to these instructions will be
// moved to the VALU.
def : Pat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B64 $src0, $src1)
>;
def : Pat <
(i1 (or i1:$src0, i1:$src1)),
(S_OR_B64 $src0, $src1)
>;
def : Pat <
(i1 (xor i1:$src0, i1:$src1)),
(S_XOR_B64 $src0, $src1)
>;
def : Pat <
(f32 (sint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
>;
def : Pat <
(f32 (uint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
>;
def : Pat <
(f64 (sint_to_fp i1:$src)),
(V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
>;
def : Pat <
(f64 (uint_to_fp i1:$src)),
(V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
>;
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
def : Pat <
(i32 (trunc i64:$a)),
(EXTRACT_SUBREG $a, sub0)
>;
def : Pat <
(i1 (trunc i32:$a)),
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;
def : Pat <
+ (i1 (trunc i16:$a)),
+ (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
+>;
+
+def : Pat <
(i1 (trunc i64:$a)),
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
def : Pat <
(i32 (bswap i32:$a)),
(V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
(V_ALIGNBIT_B32 $a, $a, (i32 24)),
(V_ALIGNBIT_B32 $a, $a, (i32 8)))
>;
multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
def : Pat <
(vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
(BFM $a, $b)
>;
def : Pat <
(vt (add (vt (shl 1, vt:$a)), -1)),
(BFM $a, (MOV (i32 0)))
>;
}
defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
def : BFEPattern <V_BFE_U32, S_MOV_B32>;
def : Pat<
(fcanonicalize f16:$src),
(V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0)
>;
def : Pat<
(fcanonicalize f32:$src),
(V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
>;
def : Pat<
(fcanonicalize f64:$src),
(V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0)
>;
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
let Predicates = [isSI] in {
// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
// way to implement it is using V_FRACT_F64.
// The workaround for the V_FRACT bug is:
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
// Convert floor(x) to (x - fract(x))
def : Pat <
(f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
(V_ADD_F64
$mods,
$x,
SRCMODS.NEG,
(V_CNDMASK_B64_PSEUDO
(V_MIN_F64
SRCMODS.NONE,
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
SRCMODS.NONE,
(V_MOV_B64_PSEUDO 0x3fefffffffffffff),
DSTCLAMP.NONE, DSTOMOD.NONE),
$x,
(V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
DSTCLAMP.NONE, DSTOMOD.NONE)
>;
} // End Predicates = [isSI]
//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
//============================================================================//
// Assembler aliases
//============================================================================//
def : MnemonicAlias<"v_add_u32", "v_add_i32">;
def : MnemonicAlias<"v_sub_u32", "v_sub_i32">;
def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">;
} // End isGCN predicate
Index: projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td (revision 314268)
+++ projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td (revision 314269)
@@ -1,621 +1,615 @@
//===-- VOP1Instructions.td - Vector Instruction Defintions ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// VOP1 Classes
//===----------------------------------------------------------------------===//
class VOP1e <bits<8> op, VOPProfile P> : Enc32 {
bits<8> vdst;
bits<9> src0;
let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, 0);
let Inst{16-9} = op;
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f; //encoding
}
class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
bits<8> vdst;
let Inst{8-0} = 0xf9; // sdwa
let Inst{16-9} = op;
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f; // encoding
}
class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
InstSI <P.Outs32, P.Ins32, "", pattern>,
VOP <opName>,
SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>,
MnemonicAlias<opName#"_e32", opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let UseNamedOperandTable = 1;
string Mnemonic = opName;
string AsmOperands = P.Asm32;
let Size = 4;
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SubtargetPredicate = isGCN;
let VOP1 = 1;
let VALU = 1;
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.Default;
VOPProfile Pfl = P;
}
class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
let isPseudo = 0;
let isCodeGenOnly = 0;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
}
class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
VOP_SDWA_Pseudo <OpName, P, pattern> {
let AsmMatchConverter = "cvtSdwaVOP1";
}
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
[(set P.DstVT:$vdst, (node P.Src0VT:$src0))]);
}
multiclass VOP1Inst <string opName, VOPProfile P,
SDPatternOperator node = null_frag> {
def _e32 : VOP1_Pseudo <opName, P>;
def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
}
//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
let VOPAsmPrefer32Bit = 1 in {
defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>;
}
let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
} // End isMoveImm = 1
// FIXME: Specify SchedRW for READFIRSTLANE_B32
// TODO: Make profile for this, there is VOP3 encoding also
def V_READFIRSTLANE_B32 :
InstSI <(outs SReg_32:$vdst),
(ins VGPR_32:$src0),
"v_readfirstlane_b32 $vdst, $src0",
[(set i32:$vdst, (int_amdgcn_readfirstlane i32:$src0))]>,
Enc32 {
let isCodeGenOnly = 0;
let UseNamedOperandTable = 1;
let Size = 4;
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SubtargetPredicate = isGCN;
let VOP1 = 1;
let VALU = 1;
let Uses = [EXEC];
let isConvergent = 1;
bits<8> vdst;
bits<9> src0;
let Inst{8-0} = src0;
let Inst{16-9} = 0x2;
let Inst{24-17} = vdst;
let Inst{31-25} = 0x3f; //encoding
}
let SchedRW = [WriteQuarterRate32] in {
defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>;
defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>;
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>;
defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>;
defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>;
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>;
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>;
defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>;
defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>;
defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>;
defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>;
} // End SchedRW = [WriteQuarterRate32]
defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
let SchedRW = [WriteQuarterRate32] in {
defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>;
defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
} // End SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteDouble] in {
defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
} // End SchedRW = [WriteDouble];
defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
let SchedRW = [WriteDouble] in {
defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
} // End SchedRW = [WriteDouble]
let SchedRW = [WriteQuarterRate32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
} // End SchedRW = [WriteQuarterRate32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
let SchedRW = [WriteDoubleAdd] in {
defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
} // End SchedRW = [WriteDoubleAdd]
defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>;
let VOPAsmPrefer32Bit = 1 in {
defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
}
// Restrict src0 to be VGPR
def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
let Src0RC32 = VRegSrc_32;
let Src0RC64 = VRegSrc_32;
let HasExt = 0;
}
// Special case because there are no true output operands. Hack vdst
// to be a src operand. The custom inserter must add a tied implicit
// def and use of the super register since there seems to be no way to
// add an implicit def of a virtual register in tablegen.
def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let Src0RC32 = VOPDstOperand<VGPR_32>;
let Src0RC64 = VOPDstOperand<VGPR_32>;
let Outs = (outs);
let Ins32 = (ins Src0RC32:$vdst, VSrc_b32:$src0);
let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0,
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel);
let Asm32 = getAsm32<1, 1>.ret;
let Asm64 = getAsm64<1, 1, 0>.ret;
let AsmDPP = getAsmDPP<1, 1, 0>.ret;
let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
let HasExt = 0;
let HasDst = 0;
let EmitDst = 1; // force vdst emission
}
let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in {
// v_movreld_b32 is a special case because the destination output
// register is really a source. It isn't actually read (but may be
// written), and is only to provide the base register to start
// indexing from. Tablegen seems to not let you define an implicit
// virtual register output for the super register being written into,
// so this must have an implicit def of the register added to it.
defm V_MOVRELD_B32 : VOP1Inst <"v_movreld_b32", VOP_MOVRELD>;
defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
} // End Uses = [M0, EXEC]
// These instruction only exist on SI and CI
let SubtargetPredicate = isSICI in {
let SchedRW = [WriteQuarterRate32] in {
defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>;
defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
defm V_RSQ_CLAMP_F32 : VOP1Inst <"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
defm V_RSQ_LEGACY_F32 : VOP1Inst <"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
} // End SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteDouble] in {
defm V_RCP_CLAMP_F64 : VOP1Inst <"v_rcp_clamp_f64", VOP_F64_F64>;
defm V_RSQ_CLAMP_F64 : VOP1Inst <"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
} // End SchedRW = [WriteDouble]
} // End SubtargetPredicate = isSICI
let SubtargetPredicate = isCIVI in {
let SchedRW = [WriteDoubleAdd] in {
defm V_TRUNC_F64 : VOP1Inst <"v_trunc_f64", VOP_F64_F64, ftrunc>;
defm V_CEIL_F64 : VOP1Inst <"v_ceil_f64", VOP_F64_F64, fceil>;
defm V_FLOOR_F64 : VOP1Inst <"v_floor_f64", VOP_F64_F64, ffloor>;
defm V_RNDNE_F64 : VOP1Inst <"v_rndne_f64", VOP_F64_F64, frint>;
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteQuarterRate32] in {
defm V_LOG_LEGACY_F32 : VOP1Inst <"v_log_legacy_f32", VOP_F32_F32>;
defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
} // End SchedRW = [WriteQuarterRate32]
} // End SubtargetPredicate = isCIVI
let SubtargetPredicate = isVI in {
defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>;
defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>;
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
}
let Predicates = [isVI] in {
def : Pat<
(f32 (f16_to_fp i16:$src)),
(V_CVT_F32_F16_e32 $src)
>;
def : Pat<
(i16 (fp_to_f16 f32:$src)),
(V_CVT_F16_F32_e32 $src)
>;
}
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// SI
//===----------------------------------------------------------------------===//
multiclass VOP1_Real_si <bits<9> op> {
let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
def _e32_si :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
def _e64_si :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
}
defm V_NOP : VOP1_Real_si <0x0>;
defm V_MOV_B32 : VOP1_Real_si <0x1>;
defm V_CVT_I32_F64 : VOP1_Real_si <0x3>;
defm V_CVT_F64_I32 : VOP1_Real_si <0x4>;
defm V_CVT_F32_I32 : VOP1_Real_si <0x5>;
defm V_CVT_F32_U32 : VOP1_Real_si <0x6>;
defm V_CVT_U32_F32 : VOP1_Real_si <0x7>;
defm V_CVT_I32_F32 : VOP1_Real_si <0x8>;
defm V_MOV_FED_B32 : VOP1_Real_si <0x9>;
defm V_CVT_F16_F32 : VOP1_Real_si <0xa>;
defm V_CVT_F32_F16 : VOP1_Real_si <0xb>;
defm V_CVT_RPI_I32_F32 : VOP1_Real_si <0xc>;
defm V_CVT_FLR_I32_F32 : VOP1_Real_si <0xd>;
defm V_CVT_OFF_F32_I4 : VOP1_Real_si <0xe>;
defm V_CVT_F32_F64 : VOP1_Real_si <0xf>;
defm V_CVT_F64_F32 : VOP1_Real_si <0x10>;
defm V_CVT_F32_UBYTE0 : VOP1_Real_si <0x11>;
defm V_CVT_F32_UBYTE1 : VOP1_Real_si <0x12>;
defm V_CVT_F32_UBYTE2 : VOP1_Real_si <0x13>;
defm V_CVT_F32_UBYTE3 : VOP1_Real_si <0x14>;
defm V_CVT_U32_F64 : VOP1_Real_si <0x15>;
defm V_CVT_F64_U32 : VOP1_Real_si <0x16>;
defm V_FRACT_F32 : VOP1_Real_si <0x20>;
defm V_TRUNC_F32 : VOP1_Real_si <0x21>;
defm V_CEIL_F32 : VOP1_Real_si <0x22>;
defm V_RNDNE_F32 : VOP1_Real_si <0x23>;
defm V_FLOOR_F32 : VOP1_Real_si <0x24>;
defm V_EXP_F32 : VOP1_Real_si <0x25>;
defm V_LOG_CLAMP_F32 : VOP1_Real_si <0x26>;
defm V_LOG_F32 : VOP1_Real_si <0x27>;
defm V_RCP_CLAMP_F32 : VOP1_Real_si <0x28>;
defm V_RCP_LEGACY_F32 : VOP1_Real_si <0x29>;
defm V_RCP_F32 : VOP1_Real_si <0x2a>;
defm V_RCP_IFLAG_F32 : VOP1_Real_si <0x2b>;
defm V_RSQ_CLAMP_F32 : VOP1_Real_si <0x2c>;
defm V_RSQ_LEGACY_F32 : VOP1_Real_si <0x2d>;
defm V_RSQ_F32 : VOP1_Real_si <0x2e>;
defm V_RCP_F64 : VOP1_Real_si <0x2f>;
defm V_RCP_CLAMP_F64 : VOP1_Real_si <0x30>;
defm V_RSQ_F64 : VOP1_Real_si <0x31>;
defm V_RSQ_CLAMP_F64 : VOP1_Real_si <0x32>;
defm V_SQRT_F32 : VOP1_Real_si <0x33>;
defm V_SQRT_F64 : VOP1_Real_si <0x34>;
defm V_SIN_F32 : VOP1_Real_si <0x35>;
defm V_COS_F32 : VOP1_Real_si <0x36>;
defm V_NOT_B32 : VOP1_Real_si <0x37>;
defm V_BFREV_B32 : VOP1_Real_si <0x38>;
defm V_FFBH_U32 : VOP1_Real_si <0x39>;
defm V_FFBL_B32 : VOP1_Real_si <0x3a>;
defm V_FFBH_I32 : VOP1_Real_si <0x3b>;
defm V_FREXP_EXP_I32_F64 : VOP1_Real_si <0x3c>;
defm V_FREXP_MANT_F64 : VOP1_Real_si <0x3d>;
defm V_FRACT_F64 : VOP1_Real_si <0x3e>;
defm V_FREXP_EXP_I32_F32 : VOP1_Real_si <0x3f>;
defm V_FREXP_MANT_F32 : VOP1_Real_si <0x40>;
defm V_CLREXCP : VOP1_Real_si <0x41>;
defm V_MOVRELD_B32 : VOP1_Real_si <0x42>;
defm V_MOVRELS_B32 : VOP1_Real_si <0x43>;
defm V_MOVRELSD_B32 : VOP1_Real_si <0x44>;
//===----------------------------------------------------------------------===//
// CI
//===----------------------------------------------------------------------===//
multiclass VOP1_Real_ci <bits<9> op> {
let AssemblerPredicates = [isCIOnly], DecoderNamespace = "CI" in {
def _e32_ci :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
def _e64_ci :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
}
defm V_TRUNC_F64 : VOP1_Real_ci <0x17>;
defm V_CEIL_F64 : VOP1_Real_ci <0x18>;
defm V_FLOOR_F64 : VOP1_Real_ci <0x1A>;
defm V_RNDNE_F64 : VOP1_Real_ci <0x19>;
defm V_LOG_LEGACY_F32 : VOP1_Real_ci <0x45>;
defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>;
//===----------------------------------------------------------------------===//
// VI
//===----------------------------------------------------------------------===//
class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
VOP_DPP <ps.OpName, P> {
let Defs = ps.Defs;
let Uses = ps.Uses;
let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
bits<8> vdst;
let Inst{8-0} = 0xfa; // dpp
let Inst{16-9} = op;
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f; //encoding
}
multiclass VOP1_Real_vi <bits<10> op> {
let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
def _e32_vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
// For now left dpp only for asm/dasm
// TODO: add corresponding pseudo
def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
}
defm V_NOP : VOP1_Real_vi <0x0>;
defm V_MOV_B32 : VOP1_Real_vi <0x1>;
defm V_CVT_I32_F64 : VOP1_Real_vi <0x3>;
defm V_CVT_F64_I32 : VOP1_Real_vi <0x4>;
defm V_CVT_F32_I32 : VOP1_Real_vi <0x5>;
defm V_CVT_F32_U32 : VOP1_Real_vi <0x6>;
defm V_CVT_U32_F32 : VOP1_Real_vi <0x7>;
defm V_CVT_I32_F32 : VOP1_Real_vi <0x8>;
defm V_CVT_F16_F32 : VOP1_Real_vi <0xa>;
defm V_CVT_F32_F16 : VOP1_Real_vi <0xb>;
defm V_CVT_RPI_I32_F32 : VOP1_Real_vi <0xc>;
defm V_CVT_FLR_I32_F32 : VOP1_Real_vi <0xd>;
defm V_CVT_OFF_F32_I4 : VOP1_Real_vi <0xe>;
defm V_CVT_F32_F64 : VOP1_Real_vi <0xf>;
defm V_CVT_F64_F32 : VOP1_Real_vi <0x10>;
defm V_CVT_F32_UBYTE0 : VOP1_Real_vi <0x11>;
defm V_CVT_F32_UBYTE1 : VOP1_Real_vi <0x12>;
defm V_CVT_F32_UBYTE2 : VOP1_Real_vi <0x13>;
defm V_CVT_F32_UBYTE3 : VOP1_Real_vi <0x14>;
defm V_CVT_U32_F64 : VOP1_Real_vi <0x15>;
defm V_CVT_F64_U32 : VOP1_Real_vi <0x16>;
defm V_FRACT_F32 : VOP1_Real_vi <0x1b>;
defm V_TRUNC_F32 : VOP1_Real_vi <0x1c>;
defm V_CEIL_F32 : VOP1_Real_vi <0x1d>;
defm V_RNDNE_F32 : VOP1_Real_vi <0x1e>;
defm V_FLOOR_F32 : VOP1_Real_vi <0x1f>;
defm V_EXP_F32 : VOP1_Real_vi <0x20>;
defm V_LOG_F32 : VOP1_Real_vi <0x21>;
defm V_RCP_F32 : VOP1_Real_vi <0x22>;
defm V_RCP_IFLAG_F32 : VOP1_Real_vi <0x23>;
defm V_RSQ_F32 : VOP1_Real_vi <0x24>;
defm V_RCP_F64 : VOP1_Real_vi <0x25>;
defm V_RSQ_F64 : VOP1_Real_vi <0x26>;
defm V_SQRT_F32 : VOP1_Real_vi <0x27>;
defm V_SQRT_F64 : VOP1_Real_vi <0x28>;
defm V_SIN_F32 : VOP1_Real_vi <0x29>;
defm V_COS_F32 : VOP1_Real_vi <0x2a>;
defm V_NOT_B32 : VOP1_Real_vi <0x2b>;
defm V_BFREV_B32 : VOP1_Real_vi <0x2c>;
defm V_FFBH_U32 : VOP1_Real_vi <0x2d>;
defm V_FFBL_B32 : VOP1_Real_vi <0x2e>;
defm V_FFBH_I32 : VOP1_Real_vi <0x2f>;
defm V_FREXP_EXP_I32_F64 : VOP1_Real_vi <0x30>;
defm V_FREXP_MANT_F64 : VOP1_Real_vi <0x31>;
defm V_FRACT_F64 : VOP1_Real_vi <0x32>;
defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>;
defm V_FREXP_MANT_F32 : VOP1_Real_vi <0x34>;
defm V_CLREXCP : VOP1_Real_vi <0x35>;
defm V_MOVRELD_B32 : VOP1_Real_vi <0x36>;
defm V_MOVRELS_B32 : VOP1_Real_vi <0x37>;
defm V_MOVRELSD_B32 : VOP1_Real_vi <0x38>;
defm V_TRUNC_F64 : VOP1_Real_vi <0x17>;
defm V_CEIL_F64 : VOP1_Real_vi <0x18>;
defm V_FLOOR_F64 : VOP1_Real_vi <0x1A>;
defm V_RNDNE_F64 : VOP1_Real_vi <0x19>;
defm V_LOG_LEGACY_F32 : VOP1_Real_vi <0x4c>;
defm V_EXP_LEGACY_F32 : VOP1_Real_vi <0x4b>;
defm V_CVT_F16_U16 : VOP1_Real_vi <0x39>;
defm V_CVT_F16_I16 : VOP1_Real_vi <0x3a>;
defm V_CVT_U16_F16 : VOP1_Real_vi <0x3b>;
defm V_CVT_I16_F16 : VOP1_Real_vi <0x3c>;
defm V_RCP_F16 : VOP1_Real_vi <0x3d>;
defm V_SQRT_F16 : VOP1_Real_vi <0x3e>;
defm V_RSQ_F16 : VOP1_Real_vi <0x3f>;
defm V_LOG_F16 : VOP1_Real_vi <0x40>;
defm V_EXP_F16 : VOP1_Real_vi <0x41>;
defm V_FREXP_MANT_F16 : VOP1_Real_vi <0x42>;
defm V_FREXP_EXP_I16_F16 : VOP1_Real_vi <0x43>;
defm V_FLOOR_F16 : VOP1_Real_vi <0x44>;
defm V_CEIL_F16 : VOP1_Real_vi <0x45>;
defm V_TRUNC_F16 : VOP1_Real_vi <0x46>;
defm V_RNDNE_F16 : VOP1_Real_vi <0x47>;
defm V_FRACT_F16 : VOP1_Real_vi <0x48>;
defm V_SIN_F16 : VOP1_Real_vi <0x49>;
defm V_COS_F16 : VOP1_Real_vi <0x4a>;
// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
// and an implicit use and def of the super register should be added.
def V_MOV_B32_indirect : VPseudoInstSI<(outs),
(ins getVALUDstForVT<i32>.ret:$vdst, getVOPSrc0ForVT<i32>.ret:$src0)>,
PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst,
getVOPSrc0ForVT<i32>.ret:$src0)> {
let VOP1 = 1;
let SubtargetPredicate = isVI;
}
// This is a pseudo variant of the v_movreld_b32 instruction in which the
// vector operand appears only twice, once as def and once as use. Using this
// pseudo avoids problems with the Two Address instructions pass.
class V_MOVRELD_B32_pseudo<RegisterClass rc> : VPseudoInstSI <
(outs rc:$vdst),
(ins rc:$vsrc, VSrc_b32:$val, i32imm:$offset)> {
let VOP1 = 1;
let Constraints = "$vsrc = $vdst";
let Uses = [M0, EXEC];
let SubtargetPredicate = HasMovrel;
}
def V_MOVRELD_B32_V1 : V_MOVRELD_B32_pseudo<VGPR_32>;
def V_MOVRELD_B32_V2 : V_MOVRELD_B32_pseudo<VReg_64>;
def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
let Predicates = [isVI] in {
def : Pat <
(i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
imm:$bound_ctrl)),
(V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
(as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
>;
def : Pat<
(i32 (anyext i16:$src)),
(COPY $src)
>;
def : Pat<
(i64 (anyext i16:$src)),
(REG_SEQUENCE VReg_64,
(i32 (COPY $src)), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
def : Pat<
(i16 (trunc i32:$src)),
(COPY $src)
>;
-def : Pat<
- (i1 (trunc i16:$src)),
- (COPY $src)
->;
-
-
def : Pat <
(i16 (trunc i64:$src)),
(EXTRACT_SUBREG $src, sub0)
>;
} // End Predicates = [isVI]
Index: projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp (revision 314269)
@@ -1,577 +1,582 @@
//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Correlated Value Propagation pass.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "correlated-value-propagation"
STATISTIC(NumPhis, "Number of phis propagated");
STATISTIC(NumSelects, "Number of selects propagated");
STATISTIC(NumMemAccess, "Number of memory access targets propagated");
STATISTIC(NumCmps, "Number of comparisons propagated");
STATISTIC(NumReturns, "Number of return values propagated");
STATISTIC(NumDeadCases, "Number of switch cases removed");
STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
STATISTIC(NumAShrs, "Number of ashr converted to lshr");
STATISTIC(NumSRems, "Number of srem converted to urem");
+static cl::opt<bool> DontProcessAdds("cvp-dont-process-adds", cl::init(true));
+
namespace {
class CorrelatedValuePropagation : public FunctionPass {
public:
static char ID;
CorrelatedValuePropagation(): FunctionPass(ID) {
initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
}
char CorrelatedValuePropagation::ID = 0;
INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
// Public interface to the Value Propagation pass
Pass *llvm::createCorrelatedValuePropagationPass() {
return new CorrelatedValuePropagation();
}
static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
if (S->getType()->isVectorTy()) return false;
if (isa<Constant>(S->getOperand(0))) return false;
Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
if (!C) return false;
ConstantInt *CI = dyn_cast<ConstantInt>(C);
if (!CI) return false;
Value *ReplaceWith = S->getOperand(1);
Value *Other = S->getOperand(2);
if (!CI->isOne()) std::swap(ReplaceWith, Other);
if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());
S->replaceAllUsesWith(ReplaceWith);
S->eraseFromParent();
++NumSelects;
return true;
}
static bool processPHI(PHINode *P, LazyValueInfo *LVI) {
bool Changed = false;
BasicBlock *BB = P->getParent();
for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
Value *Incoming = P->getIncomingValue(i);
if (isa<Constant>(Incoming)) continue;
Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
// Look if the incoming value is a select with a scalar condition for which
// LVI can tells us the value. In that case replace the incoming value with
// the appropriate value of the select. This often allows us to remove the
// select later.
if (!V) {
SelectInst *SI = dyn_cast<SelectInst>(Incoming);
if (!SI) continue;
Value *Condition = SI->getCondition();
if (!Condition->getType()->isVectorTy()) {
if (Constant *C = LVI->getConstantOnEdge(
Condition, P->getIncomingBlock(i), BB, P)) {
if (C->isOneValue()) {
V = SI->getTrueValue();
} else if (C->isZeroValue()) {
V = SI->getFalseValue();
}
// Once LVI learns to handle vector types, we could also add support
// for vector type constants that are not all zeroes or all ones.
}
}
// Look if the select has a constant but LVI tells us that the incoming
// value can never be that constant. In that case replace the incoming
// value with the other value of the select. This often allows us to
// remove the select later.
if (!V) {
Constant *C = dyn_cast<Constant>(SI->getFalseValue());
if (!C) continue;
if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
P->getIncomingBlock(i), BB, P) !=
LazyValueInfo::False)
continue;
V = SI->getTrueValue();
}
DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
}
P->setIncomingValue(i, V);
Changed = true;
}
// FIXME: Provide TLI, DT, AT to SimplifyInstruction.
const DataLayout &DL = BB->getModule()->getDataLayout();
if (Value *V = SimplifyInstruction(P, DL)) {
P->replaceAllUsesWith(V);
P->eraseFromParent();
Changed = true;
}
if (Changed)
++NumPhis;
return Changed;
}
static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
Value *Pointer = nullptr;
if (LoadInst *L = dyn_cast<LoadInst>(I))
Pointer = L->getPointerOperand();
else
Pointer = cast<StoreInst>(I)->getPointerOperand();
if (isa<Constant>(Pointer)) return false;
Constant *C = LVI->getConstant(Pointer, I->getParent(), I);
if (!C) return false;
++NumMemAccess;
I->replaceUsesOfWith(Pointer, C);
return true;
}
/// See if LazyValueInfo's ability to exploit edge conditions or range
/// information is sufficient to prove this comparison. Even for local
/// conditions, this can sometimes prove conditions instcombine can't by
/// exploiting range information.
static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
Value *Op0 = C->getOperand(0);
Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
if (!Op1) return false;
// As a policy choice, we choose not to waste compile time on anything where
// the comparison is testing local values. While LVI can sometimes reason
// about such cases, it's not its primary purpose. We do make sure to do
// the block local query for uses from terminator instructions, but that's
// handled in the code for each terminator.
auto *I = dyn_cast<Instruction>(Op0);
if (I && I->getParent() == C->getParent())
return false;
LazyValueInfo::Tristate Result =
LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C);
if (Result == LazyValueInfo::Unknown) return false;
++NumCmps;
if (Result == LazyValueInfo::True)
C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext()));
else
C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext()));
C->eraseFromParent();
return true;
}
/// Simplify a switch instruction by removing cases which can never fire. If the
/// uselessness of a case could be determined locally then constant propagation
/// would already have figured it out. Instead, walk the predecessors and
/// statically evaluate cases based on information available on that edge. Cases
/// that cannot fire no matter what the incoming edge can safely be removed. If
/// a case fires on every incoming edge then the entire switch can be removed
/// and replaced with a branch to the case destination.
static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
Value *Cond = SI->getCondition();
BasicBlock *BB = SI->getParent();
// If the condition was defined in same block as the switch then LazyValueInfo
// currently won't say anything useful about it, though in theory it could.
if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB)
return false;
// If the switch is unreachable then trying to improve it is a waste of time.
pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
if (PB == PE) return false;
// Analyse each switch case in turn. This is done in reverse order so that
// removing a case doesn't cause trouble for the iteration.
bool Changed = false;
for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE;
) {
ConstantInt *Case = CI.getCaseValue();
// Check to see if the switch condition is equal to/not equal to the case
// value on every incoming edge, equal/not equal being the same each time.
LazyValueInfo::Tristate State = LazyValueInfo::Unknown;
for (pred_iterator PI = PB; PI != PE; ++PI) {
// Is the switch condition equal to the case value?
LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ,
Cond, Case, *PI,
BB, SI);
// Give up on this case if nothing is known.
if (Value == LazyValueInfo::Unknown) {
State = LazyValueInfo::Unknown;
break;
}
// If this was the first edge to be visited, record that all other edges
// need to give the same result.
if (PI == PB) {
State = Value;
continue;
}
// If this case is known to fire for some edges and known not to fire for
// others then there is nothing we can do - give up.
if (Value != State) {
State = LazyValueInfo::Unknown;
break;
}
}
if (State == LazyValueInfo::False) {
// This case never fires - remove it.
CI.getCaseSuccessor()->removePredecessor(BB);
SI->removeCase(CI); // Does not invalidate the iterator.
// The condition can be modified by removePredecessor's PHI simplification
// logic.
Cond = SI->getCondition();
++NumDeadCases;
Changed = true;
} else if (State == LazyValueInfo::True) {
// This case always fires. Arrange for the switch to be turned into an
// unconditional branch by replacing the switch condition with the case
// value.
SI->setCondition(Case);
NumDeadCases += SI->getNumCases();
Changed = true;
break;
}
}
if (Changed)
// If the switch has been simplified to the point where it can be replaced
// by a branch then do so now.
ConstantFoldTerminator(BB);
return Changed;
}
/// Infer nonnull attributes for the arguments at the specified callsite.
static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
SmallVector<unsigned, 4> Indices;
unsigned ArgNo = 0;
for (Value *V : CS.args()) {
PointerType *Type = dyn_cast<PointerType>(V->getType());
// Try to mark pointer typed parameters as non-null. We skip the
// relatively expensive analysis for constants which are obviously either
// null or non-null to start with.
if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
!isa<Constant>(V) &&
LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
ConstantPointerNull::get(Type),
CS.getInstruction()) == LazyValueInfo::False)
Indices.push_back(ArgNo + 1);
ArgNo++;
}
assert(ArgNo == CS.arg_size() && "sanity check");
if (Indices.empty())
return false;
AttributeSet AS = CS.getAttributes();
LLVMContext &Ctx = CS.getInstruction()->getContext();
AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull));
CS.setAttributes(AS);
return true;
}
// Helper function to rewrite srem and sdiv. As a policy choice, we choose not
// to waste compile time on anything where the operands are local defs. While
// LVI can sometimes reason about such cases, it's not its primary purpose.
static bool hasLocalDefs(BinaryOperator *SDI) {
for (Value *O : SDI->operands()) {
auto *I = dyn_cast<Instruction>(O);
if (I && I->getParent() == SDI->getParent())
return true;
}
return false;
}
static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
Constant *Zero = ConstantInt::get(SDI->getType(), 0);
for (Value *O : SDI->operands()) {
auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SGE, O, Zero, SDI);
if (Result != LazyValueInfo::True)
return false;
}
return true;
}
static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
!hasPositiveOperands(SDI, LVI))
return false;
++NumSRems;
auto *BO = BinaryOperator::CreateURem(SDI->getOperand(0), SDI->getOperand(1),
SDI->getName(), SDI);
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
return true;
}
/// See if LazyValueInfo's ability to exploit edge conditions or range
/// information is sufficient to prove the both operands of this SDiv are
/// positive. If this is the case, replace the SDiv with a UDiv. Even for local
/// conditions, this can sometimes prove conditions instcombine can't by
/// exploiting range information.
static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
!hasPositiveOperands(SDI, LVI))
return false;
++NumSDivs;
auto *BO = BinaryOperator::CreateUDiv(SDI->getOperand(0), SDI->getOperand(1),
SDI->getName(), SDI);
BO->setIsExact(SDI->isExact());
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
return true;
}
static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI))
return false;
Constant *Zero = ConstantInt::get(SDI->getType(), 0);
if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, SDI->getOperand(0), Zero, SDI) !=
LazyValueInfo::True)
return false;
++NumAShrs;
auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
SDI->getName(), SDI);
BO->setIsExact(SDI->isExact());
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
return true;
}
static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
typedef OverflowingBinaryOperator OBO;
+
+ if (DontProcessAdds)
+ return false;
if (AddOp->getType()->isVectorTy() || hasLocalDefs(AddOp))
return false;
bool NSW = AddOp->hasNoSignedWrap();
bool NUW = AddOp->hasNoUnsignedWrap();
if (NSW && NUW)
return false;
BasicBlock *BB = AddOp->getParent();
Value *LHS = AddOp->getOperand(0);
Value *RHS = AddOp->getOperand(1);
ConstantRange LRange = LVI->getConstantRange(LHS, BB, AddOp);
// Initialize RRange only if we need it. If we know that guaranteed no wrap
// range for the given LHS range is empty don't spend time calculating the
// range for the RHS.
Optional<ConstantRange> RRange;
auto LazyRRange = [&] () {
if (!RRange)
RRange = LVI->getConstantRange(RHS, BB, AddOp);
return RRange.getValue();
};
bool Changed = false;
if (!NUW) {
ConstantRange NUWRange =
LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
OBO::NoUnsignedWrap);
if (!NUWRange.isEmptySet()) {
bool NewNUW = NUWRange.contains(LazyRRange());
AddOp->setHasNoUnsignedWrap(NewNUW);
Changed |= NewNUW;
}
}
if (!NSW) {
ConstantRange NSWRange =
LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
OBO::NoSignedWrap);
if (!NSWRange.isEmptySet()) {
bool NewNSW = NSWRange.contains(LazyRRange());
AddOp->setHasNoSignedWrap(NewNSW);
Changed |= NewNSW;
}
}
return Changed;
}
static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
if (Constant *C = LVI->getConstant(V, At->getParent(), At))
return C;
// TODO: The following really should be sunk inside LVI's core algorithm, or
// at least the outer shims around such.
auto *C = dyn_cast<CmpInst>(V);
if (!C) return nullptr;
Value *Op0 = C->getOperand(0);
Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
if (!Op1) return nullptr;
LazyValueInfo::Tristate Result =
LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At);
if (Result == LazyValueInfo::Unknown)
return nullptr;
return (Result == LazyValueInfo::True) ?
ConstantInt::getTrue(C->getContext()) :
ConstantInt::getFalse(C->getContext());
}
static bool runImpl(Function &F, LazyValueInfo *LVI) {
bool FnChanged = false;
// Visiting in a pre-order depth-first traversal causes us to simplify early
// blocks before querying later blocks (which require us to analyze early
// blocks). Eagerly simplifying shallow blocks means there is strictly less
// work to do for deep blocks. This also means we don't visit unreachable
// blocks.
for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
bool BBChanged = false;
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
Instruction *II = &*BI++;
switch (II->getOpcode()) {
case Instruction::Select:
BBChanged |= processSelect(cast<SelectInst>(II), LVI);
break;
case Instruction::PHI:
BBChanged |= processPHI(cast<PHINode>(II), LVI);
break;
case Instruction::ICmp:
case Instruction::FCmp:
BBChanged |= processCmp(cast<CmpInst>(II), LVI);
break;
case Instruction::Load:
case Instruction::Store:
BBChanged |= processMemAccess(II, LVI);
break;
case Instruction::Call:
case Instruction::Invoke:
BBChanged |= processCallSite(CallSite(II), LVI);
break;
case Instruction::SRem:
BBChanged |= processSRem(cast<BinaryOperator>(II), LVI);
break;
case Instruction::SDiv:
BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
break;
case Instruction::AShr:
BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
break;
case Instruction::Add:
BBChanged |= processAdd(cast<BinaryOperator>(II), LVI);
break;
}
}
Instruction *Term = BB->getTerminator();
switch (Term->getOpcode()) {
case Instruction::Switch:
BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI);
break;
case Instruction::Ret: {
auto *RI = cast<ReturnInst>(Term);
// Try to determine the return value if we can. This is mainly here to
// simplify the writing of unit tests, but also helps to enable IPO by
// constant folding the return values of callees.
auto *RetVal = RI->getReturnValue();
if (!RetVal) break; // handle "ret void"
if (isa<Constant>(RetVal)) break; // nothing to do
if (auto *C = getConstantAt(RetVal, RI, LVI)) {
++NumReturns;
RI->replaceUsesOfWith(RetVal, C);
BBChanged = true;
}
}
};
FnChanged |= BBChanged;
}
return FnChanged;
}
bool CorrelatedValuePropagation::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
return runImpl(F, LVI);
}
PreservedAnalyses
CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
bool Changed = runImpl(F, LVI);
// FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
// solution?
AM.invalidate<LazyValueAnalysis>(F);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
return PA;
}
Index: projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp (revision 314269)
@@ -1,2280 +1,2280 @@
//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass reassociates commutative expressions in an order that is designed
// to promote better constant propagation, GCSE, LICM, PRE, etc.
//
// For example: 4 + (x + 5) -> x + (4 + 5)
//
// In the implementation of this algorithm, constants are assigned rank = 0,
// function arguments are rank = 1, and other values are assigned ranks
// corresponding to the reverse post order traversal of current function
// (starting at 2), which effectively gives values in deep loops higher rank
// than values not in loops.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/Reassociate.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
using namespace reassociate;
#define DEBUG_TYPE "reassociate"
STATISTIC(NumChanged, "Number of insts reassociated");
STATISTIC(NumAnnihil, "Number of expr tree annihilated");
STATISTIC(NumFactor , "Number of multiplies factored");
#ifndef NDEBUG
/// Print out the expression identified in the Ops list.
///
static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
Module *M = I->getModule();
dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
<< *Ops[0].Op->getType() << '\t';
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
dbgs() << "[ ";
Ops[i].Op->printAsOperand(dbgs(), false, M);
dbgs() << ", #" << Ops[i].Rank << "] ";
}
}
#endif
/// Utility class representing a non-constant Xor-operand. We classify
/// non-constant Xor-Operands into two categories:
/// C1) The operand is in the form "X & C", where C is a constant and C != ~0
/// C2)
/// C2.1) The operand is in the form of "X | C", where C is a non-zero
/// constant.
/// C2.2) Any operand E which doesn't fall into C1 and C2.1, we view this
/// operand as "E | 0"
class llvm::reassociate::XorOpnd {
public:
XorOpnd(Value *V);
bool isInvalid() const { return SymbolicPart == nullptr; }
bool isOrExpr() const { return isOr; }
Value *getValue() const { return OrigVal; }
Value *getSymbolicPart() const { return SymbolicPart; }
unsigned getSymbolicRank() const { return SymbolicRank; }
const APInt &getConstPart() const { return ConstPart; }
void Invalidate() { SymbolicPart = OrigVal = nullptr; }
void setSymbolicRank(unsigned R) { SymbolicRank = R; }
private:
Value *OrigVal;
Value *SymbolicPart;
APInt ConstPart;
unsigned SymbolicRank;
bool isOr;
};
XorOpnd::XorOpnd(Value *V) {
assert(!isa<ConstantInt>(V) && "No ConstantInt");
OrigVal = V;
Instruction *I = dyn_cast<Instruction>(V);
SymbolicRank = 0;
if (I && (I->getOpcode() == Instruction::Or ||
I->getOpcode() == Instruction::And)) {
Value *V0 = I->getOperand(0);
Value *V1 = I->getOperand(1);
if (isa<ConstantInt>(V0))
std::swap(V0, V1);
if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) {
ConstPart = C->getValue();
SymbolicPart = V0;
isOr = (I->getOpcode() == Instruction::Or);
return;
}
}
// view the operand as "V | 0"
SymbolicPart = V;
ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth());
isOr = true;
}
/// Return true if V is an instruction of the specified opcode and if it
/// only has one use.
static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
if (V->hasOneUse() && isa<Instruction>(V) &&
cast<Instruction>(V)->getOpcode() == Opcode &&
(!isa<FPMathOperator>(V) ||
cast<Instruction>(V)->hasUnsafeAlgebra()))
return cast<BinaryOperator>(V);
return nullptr;
}
static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
unsigned Opcode2) {
if (V->hasOneUse() && isa<Instruction>(V) &&
(cast<Instruction>(V)->getOpcode() == Opcode1 ||
cast<Instruction>(V)->getOpcode() == Opcode2) &&
(!isa<FPMathOperator>(V) ||
cast<Instruction>(V)->hasUnsafeAlgebra()))
return cast<BinaryOperator>(V);
return nullptr;
}
void ReassociatePass::BuildRankMap(Function &F,
ReversePostOrderTraversal<Function*> &RPOT) {
unsigned i = 2;
// Assign distinct ranks to function arguments.
for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
ValueRankMap[&*I] = ++i;
DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n");
}
// Traverse basic blocks in ReversePostOrder
for (BasicBlock *BB : RPOT) {
unsigned BBRank = RankMap[BB] = ++i << 16;
// Walk the basic block, adding precomputed ranks for any instructions that
// we cannot move. This ensures that the ranks for these instructions are
// all different in the block.
for (Instruction &I : *BB)
if (mayBeMemoryDependent(I))
ValueRankMap[&I] = ++BBRank;
}
}
unsigned ReassociatePass::getRank(Value *V) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I) {
if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument.
return 0; // Otherwise it's a global or constant, rank 0.
}
if (unsigned Rank = ValueRankMap[I])
return Rank; // Rank already known?
// If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that
// we can reassociate expressions for code motion! Since we do not recurse
// for PHI nodes, we cannot have infinite recursion here, because there
// cannot be loops in the value graph that do not go through PHI nodes.
unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
for (unsigned i = 0, e = I->getNumOperands();
i != e && Rank != MaxRank; ++i)
Rank = std::max(Rank, getRank(I->getOperand(i)));
// If this is a not or neg instruction, do not count it for rank. This
// assures us that X and ~X will have the same rank.
if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
!BinaryOperator::isFNeg(I))
++Rank;
DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
return ValueRankMap[I] = Rank;
}
// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
void ReassociatePass::canonicalizeOperands(Instruction *I) {
assert(isa<BinaryOperator>(I) && "Expected binary operator.");
assert(I->isCommutative() && "Expected commutative operator.");
Value *LHS = I->getOperand(0);
Value *RHS = I->getOperand(1);
unsigned LHSRank = getRank(LHS);
unsigned RHSRank = getRank(RHS);
if (isa<Constant>(RHS))
return;
if (isa<Constant>(LHS) || RHSRank < LHSRank)
cast<BinaryOperator>(I)->swapOperands();
}
static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
Instruction *InsertBefore, Value *FlagsOp) {
if (S1->getType()->isIntOrIntVectorTy())
return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
else {
BinaryOperator *Res =
BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
return Res;
}
}
static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
Instruction *InsertBefore, Value *FlagsOp) {
if (S1->getType()->isIntOrIntVectorTy())
return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
else {
BinaryOperator *Res =
BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
return Res;
}
}
static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
Instruction *InsertBefore, Value *FlagsOp) {
if (S1->getType()->isIntOrIntVectorTy())
return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
else {
BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore);
Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
return Res;
}
}
/// Replace 0-X with X*-1.
static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
Type *Ty = Neg->getType();
Constant *NegOne = Ty->isIntOrIntVectorTy() ?
ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0);
BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg);
Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op.
Res->takeName(Neg);
Neg->replaceAllUsesWith(Res);
Res->setDebugLoc(Neg->getDebugLoc());
return Res;
}
/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
/// even x in Bitwidth-bit arithmetic.
static unsigned CarmichaelShift(unsigned Bitwidth) {
if (Bitwidth < 3)
return Bitwidth - 1;
return Bitwidth - 2;
}
/// Add the extra weight 'RHS' to the existing weight 'LHS',
/// reducing the combined weight using any special properties of the operation.
/// The existing weight LHS represents the computation X op X op ... op X where
/// X occurs LHS times. The combined weight represents X op X op ... op X with
/// X occurring LHS + RHS times. If op is "Xor" for example then the combined
/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
// If we were working with infinite precision arithmetic then the combined
// weight would be LHS + RHS. But we are using finite precision arithmetic,
// and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
// for nilpotent operations and addition, but not for idempotent operations
// and multiplication), so it is important to correctly reduce the combined
// weight back into range if wrapping would be wrong.
// If RHS is zero then the weight didn't change.
if (RHS.isMinValue())
return;
// If LHS is zero then the combined weight is RHS.
if (LHS.isMinValue()) {
LHS = RHS;
return;
}
// From this point on we know that neither LHS nor RHS is zero.
if (Instruction::isIdempotent(Opcode)) {
// Idempotent means X op X === X, so any non-zero weight is equivalent to a
// weight of 1. Keeping weights at zero or one also means that wrapping is
// not a problem.
assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
return; // Return a weight of 1.
}
if (Instruction::isNilpotent(Opcode)) {
// Nilpotent means X op X === 0, so reduce weights modulo 2.
assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
LHS = 0; // 1 + 1 === 0 modulo 2.
return;
}
if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
// TODO: Reduce the weight by exploiting nsw/nuw?
LHS += RHS;
return;
}
assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
"Unknown associative operation!");
unsigned Bitwidth = LHS.getBitWidth();
// If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
// can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
// bit number x, since either x is odd in which case x^CM = 1, or x is even in
// which case both x^W and x^(W - CM) are zero. By subtracting off multiples
// of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
// which by a happy accident means that they can always be represented using
// Bitwidth bits.
// TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
// the Carmichael number).
if (Bitwidth > 3) {
/// CM - The value of Carmichael's lambda function.
APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
// Any weight W >= Threshold can be replaced with W - CM.
APInt Threshold = CM + Bitwidth;
assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
// For Bitwidth 4 or more the following sum does not overflow.
LHS += RHS;
while (LHS.uge(Threshold))
LHS -= CM;
} else {
// To avoid problems with overflow do everything the same as above but using
// a larger type.
unsigned CM = 1U << CarmichaelShift(Bitwidth);
unsigned Threshold = CM + Bitwidth;
assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
"Weights not reduced!");
unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
while (Total >= Threshold)
Total -= CM;
LHS = Total;
}
}
typedef std::pair<Value*, APInt> RepeatedValue;
/// Given an associative binary expression, return the leaf
/// nodes in Ops along with their weights (how many times the leaf occurs). The
/// original expression is the same as
/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times
/// op
/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times
/// op
/// ...
/// op
/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
///
/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
///
/// This routine may modify the function, in which case it returns 'true'. The
/// changes it makes may well be destructive, changing the value computed by 'I'
/// to something completely different. Thus if the routine returns 'true' then
/// you MUST either replace I with a new expression computed from the Ops array,
/// or use RewriteExprTree to put the values back in.
///
/// A leaf node is either not a binary operation of the same kind as the root
/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
/// opcode), or is the same kind of binary operator but has a use which either
/// does not belong to the expression, or does belong to the expression but is
/// a leaf node. Every leaf node has at least one use that is a non-leaf node
/// of the expression, while for non-leaf nodes (except for the root 'I') every
/// use is a non-leaf node of the expression.
///
/// For example:
/// expression graph node names
///
/// + | I
/// / \ |
/// + + | A, B
/// / \ / \ |
/// * + * | C, D, E
/// / \ / \ / \ |
/// + * | F, G
///
/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in
/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
///
/// The expression is maximal: if some instruction is a binary operator of the
/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
/// then the instruction also belongs to the expression, is not a leaf node of
/// it, and its operands also belong to the expression (but may be leaf nodes).
///
/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
/// order to ensure that every non-root node in the expression has *exactly one*
/// use by a non-leaf node of the expression. This destruction means that the
/// caller MUST either replace 'I' with a new expression or use something like
/// RewriteExprTree to put the values back in if the routine indicates that it
/// made a change by returning 'true'.
///
/// In the above example either the right operand of A or the left operand of B
/// will be replaced by undef. If it is B's operand then this gives:
///
/// + | I
/// / \ |
/// + + | A, B - operand of B replaced with undef
/// / \ \ |
/// * + * | C, D, E
/// / \ / \ / \ |
/// + * | F, G
///
/// Note that such undef operands can only be reached by passing through 'I'.
/// For example, if you visit operands recursively starting from a leaf node
/// then you will never see such an undef operand unless you get back to 'I',
/// which requires passing through a phi node.
///
/// Note that this routine may also mutate binary operators of the wrong type
/// that have all uses inside the expression (i.e. only used by non-leaf nodes
/// of the expression) if it can turn them into binary operators of the right
/// type and thus make the expression bigger.
static bool LinearizeExprTree(BinaryOperator *I,
SmallVectorImpl<RepeatedValue> &Ops) {
DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
unsigned Opcode = I->getOpcode();
assert(I->isAssociative() && I->isCommutative() &&
"Expected an associative and commutative operation!");
// Visit all operands of the expression, keeping track of their weight (the
// number of paths from the expression root to the operand, or if you like
// the number of times that operand occurs in the linearized expression).
// For example, if I = X + A, where X = A + B, then I, X and B have weight 1
// while A has weight two.
// Worklist of non-leaf nodes (their operands are in the expression too) along
// with their weights, representing a certain number of paths to the operator.
// If an operator occurs in the worklist multiple times then we found multiple
// ways to get to it.
SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight)
Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
bool Changed = false;
// Leaves of the expression are values that either aren't the right kind of
// operation (eg: a constant, or a multiply in an add tree), or are, but have
// some uses that are not inside the expression. For example, in I = X + X,
// X = A + B, the value X has two uses (by I) that are in the expression. If
// X has any other uses, for example in a return instruction, then we consider
// X to be a leaf, and won't analyze it further. When we first visit a value,
// if it has more than one use then at first we conservatively consider it to
// be a leaf. Later, as the expression is explored, we may discover some more
// uses of the value from inside the expression. If all uses turn out to be
// from within the expression (and the value is a binary operator of the right
// kind) then the value is no longer considered to be a leaf, and its operands
// are explored.
// Leaves - Keeps track of the set of putative leaves as well as the number of
// paths to each leaf seen so far.
typedef DenseMap<Value*, APInt> LeafMap;
LeafMap Leaves; // Leaf -> Total weight so far.
SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order.
#ifndef NDEBUG
SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme.
#endif
while (!Worklist.empty()) {
std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val();
I = P.first; // We examine the operands of this binary operator.
for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
Value *Op = I->getOperand(OpIdx);
APInt Weight = P.second; // Number of paths to this operand.
DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
assert(!Op->use_empty() && "No uses, so how did we get to it?!");
// If this is a binary operation of the right kind with only one use then
// add its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
assert(Visited.insert(Op).second && "Not first visit!");
DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
Worklist.push_back(std::make_pair(BO, Weight));
continue;
}
// Appears to be a leaf. Is the operand already in the set of leaves?
LeafMap::iterator It = Leaves.find(Op);
if (It == Leaves.end()) {
// Not in the leaf map. Must be the first time we saw this operand.
assert(Visited.insert(Op).second && "Not first visit!");
if (!Op->hasOneUse()) {
// This value has uses not accounted for by the expression, so it is
// not safe to modify. Mark it as being a leaf.
DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
LeafOrder.push_back(Op);
Leaves[Op] = Weight;
continue;
}
// No uses outside the expression, try morphing it.
} else {
// Already in the leaf map.
assert(It != Leaves.end() && Visited.count(Op) &&
"In leaf map but not visited!");
// Update the number of paths to the leaf.
IncorporateWeight(It->second, Weight, Opcode);
#if 0 // TODO: Re-enable once PR13021 is fixed.
// The leaf already has one use from inside the expression. As we want
// exactly one such use, drop this new use of the leaf.
assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
I->setOperand(OpIdx, UndefValue::get(I->getType()));
Changed = true;
// If the leaf is a binary operation of the right kind and we now see
// that its multiple original uses were in fact all by nodes belonging
// to the expression, then no longer consider it to be a leaf and add
// its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
Worklist.push_back(std::make_pair(BO, It->second));
Leaves.erase(It);
continue;
}
#endif
// If we still have uses that are not accounted for by the expression
// then it is not safe to modify the value.
if (!Op->hasOneUse())
continue;
// No uses outside the expression, try morphing it.
Weight = It->second;
Leaves.erase(It); // Since the value may be morphed below.
}
// At this point we have a value which, first of all, is not a binary
// expression of the right kind, and secondly, is only used inside the
// expression. This means that it can safely be modified. See if we
// can usefully morph it into an expression of the right kind.
assert((!isa<Instruction>(Op) ||
cast<Instruction>(Op)->getOpcode() != Opcode
|| (isa<FPMathOperator>(Op) &&
!cast<Instruction>(Op)->hasUnsafeAlgebra())) &&
"Should have been handled above!");
assert(Op->hasOneUse() && "Has uses outside the expression tree!");
// If this is a multiply expression, turn any internal negations into
// multiplies by -1 so they can be reassociated.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
(Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
BO = LowerNegateToMultiply(BO);
DEBUG(dbgs() << *BO << '\n');
Worklist.push_back(std::make_pair(BO, Weight));
Changed = true;
continue;
}
// Failed to morph into an expression of the right type. This really is
// a leaf.
DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
LeafOrder.push_back(Op);
Leaves[Op] = Weight;
}
}
// The leaves, repeated according to their weights, represent the linearized
// form of the expression.
for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
Value *V = LeafOrder[i];
LeafMap::iterator It = Leaves.find(V);
if (It == Leaves.end())
// Node initially thought to be a leaf wasn't.
continue;
assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
APInt Weight = It->second;
if (Weight.isMinValue())
// Leaf already output or weight reduction eliminated it.
continue;
// Ensure the leaf is only output once.
It->second = 0;
Ops.push_back(std::make_pair(V, Weight));
}
// For nilpotent operations or addition there may be no operands, for example
// because the expression was "X xor X" or consisted of 2^Bitwidth additions:
// in both cases the weight reduces to 0 causing the value to be skipped.
if (Ops.empty()) {
Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
assert(Identity && "Associative operation without identity!");
Ops.emplace_back(Identity, APInt(Bitwidth, 1));
}
return Changed;
}
/// Now that the operands for this expression tree are
/// linearized and optimized, emit them in-order.
void ReassociatePass::RewriteExprTree(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops) {
assert(Ops.size() > 1 && "Single values should be used directly!");
// Since our optimizations should never increase the number of operations, the
// new expression can usually be written reusing the existing binary operators
// from the original expression tree, without creating any new instructions,
// though the rewritten expression may have a completely different topology.
// We take care to not change anything if the new expression will be the same
// as the original. If more than trivial changes (like commuting operands)
// were made then we are obliged to clear out any optional subclass data like
// nsw flags.
/// NodesToRewrite - Nodes from the original expression available for writing
/// the new expression into.
SmallVector<BinaryOperator*, 8> NodesToRewrite;
unsigned Opcode = I->getOpcode();
BinaryOperator *Op = I;
/// NotRewritable - The operands being written will be the leaves of the new
/// expression and must not be used as inner nodes (via NodesToRewrite) by
/// mistake. Inner nodes are always reassociable, and usually leaves are not
/// (if they were they would have been incorporated into the expression and so
/// would not be leaves), so most of the time there is no danger of this. But
/// in rare cases a leaf may become reassociable if an optimization kills uses
/// of it, or it may momentarily become reassociable during rewriting (below)
/// due it being removed as an operand of one of its uses. Ensure that misuse
/// of leaf nodes as inner nodes cannot occur by remembering all of the future
/// leaves and refusing to reuse any of them as inner nodes.
SmallPtrSet<Value*, 8> NotRewritable;
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
NotRewritable.insert(Ops[i].Op);
// ExpressionChanged - Non-null if the rewritten expression differs from the
// original in some non-trivial way, requiring the clearing of optional flags.
// Flags are cleared from the operator in ExpressionChanged up to I inclusive.
BinaryOperator *ExpressionChanged = nullptr;
for (unsigned i = 0; ; ++i) {
// The last operation (which comes earliest in the IR) is special as both
// operands will come from Ops, rather than just one with the other being
// a subexpression.
if (i+2 == Ops.size()) {
Value *NewLHS = Ops[i].Op;
Value *NewRHS = Ops[i+1].Op;
Value *OldLHS = Op->getOperand(0);
Value *OldRHS = Op->getOperand(1);
if (NewLHS == OldLHS && NewRHS == OldRHS)
// Nothing changed, leave it alone.
break;
if (NewLHS == OldRHS && NewRHS == OldLHS) {
// The order of the operands was reversed. Swap them.
DEBUG(dbgs() << "RA: " << *Op << '\n');
Op->swapOperands();
DEBUG(dbgs() << "TO: " << *Op << '\n');
MadeChange = true;
++NumChanged;
break;
}
// The new operation differs non-trivially from the original. Overwrite
// the old operands with the new ones.
DEBUG(dbgs() << "RA: " << *Op << '\n');
if (NewLHS != OldLHS) {
BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
if (BO && !NotRewritable.count(BO))
NodesToRewrite.push_back(BO);
Op->setOperand(0, NewLHS);
}
if (NewRHS != OldRHS) {
BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
if (BO && !NotRewritable.count(BO))
NodesToRewrite.push_back(BO);
Op->setOperand(1, NewRHS);
}
DEBUG(dbgs() << "TO: " << *Op << '\n');
ExpressionChanged = Op;
MadeChange = true;
++NumChanged;
break;
}
// Not the last operation. The left-hand side will be a sub-expression
// while the right-hand side will be the current element of Ops.
Value *NewRHS = Ops[i].Op;
if (NewRHS != Op->getOperand(1)) {
DEBUG(dbgs() << "RA: " << *Op << '\n');
if (NewRHS == Op->getOperand(0)) {
// The new right-hand side was already present as the left operand. If
// we are lucky then swapping the operands will sort out both of them.
Op->swapOperands();
} else {
// Overwrite with the new right-hand side.
BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
if (BO && !NotRewritable.count(BO))
NodesToRewrite.push_back(BO);
Op->setOperand(1, NewRHS);
ExpressionChanged = Op;
}
DEBUG(dbgs() << "TO: " << *Op << '\n');
MadeChange = true;
++NumChanged;
}
// Now deal with the left-hand side. If this is already an operation node
// from the original expression then just rewrite the rest of the expression
// into it.
BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
if (BO && !NotRewritable.count(BO)) {
Op = BO;
continue;
}
// Otherwise, grab a spare node from the original expression and use that as
// the left-hand side. If there are no nodes left then the optimizers made
// an expression with more nodes than the original! This usually means that
// they did something stupid but it might mean that the problem was just too
// hard (finding the mimimal number of multiplications needed to realize a
// multiplication expression is NP-complete). Whatever the reason, smart or
// stupid, create a new node if there are none left.
BinaryOperator *NewOp;
if (NodesToRewrite.empty()) {
Constant *Undef = UndefValue::get(I->getType());
NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
Undef, Undef, "", I);
if (NewOp->getType()->isFPOrFPVectorTy())
NewOp->setFastMathFlags(I->getFastMathFlags());
} else {
NewOp = NodesToRewrite.pop_back_val();
}
DEBUG(dbgs() << "RA: " << *Op << '\n');
Op->setOperand(0, NewOp);
DEBUG(dbgs() << "TO: " << *Op << '\n');
ExpressionChanged = Op;
MadeChange = true;
++NumChanged;
Op = NewOp;
}
// If the expression changed non-trivially then clear out all subclass data
// starting from the operator specified in ExpressionChanged, and compactify
// the operators to just before the expression root to guarantee that the
// expression tree is dominated by all of Ops.
if (ExpressionChanged)
do {
// Preserve FastMathFlags.
if (isa<FPMathOperator>(I)) {
FastMathFlags Flags = I->getFastMathFlags();
ExpressionChanged->clearSubclassOptionalData();
ExpressionChanged->setFastMathFlags(Flags);
} else
ExpressionChanged->clearSubclassOptionalData();
if (ExpressionChanged == I)
break;
ExpressionChanged->moveBefore(I);
ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
} while (1);
// Throw away any left over nodes from the original expression.
for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
RedoInsts.insert(NodesToRewrite[i]);
}
/// Insert instructions before the instruction pointed to by BI,
/// that computes the negative version of the value specified. The negative
/// version of the value is returned, and BI is left pointing at the instruction
/// that should be processed next by the reassociation pass.
/// Also add intermediate instructions to the redo list that are modified while
/// pushing the negates through adds. These will be revisited to see if
/// additional opportunities have been exposed.
static Value *NegateValue(Value *V, Instruction *BI,
SetVector<AssertingVH<Instruction>> &ToRedo) {
if (Constant *C = dyn_cast<Constant>(V)) {
if (C->getType()->isFPOrFPVectorTy()) {
return ConstantExpr::getFNeg(C);
}
return ConstantExpr::getNeg(C);
}
// We are trying to expose opportunity for reassociation. One of the things
// that we want to do to achieve this is to push a negation as deep into an
// expression chain as possible, to expose the add instructions. In practice,
// this means that we turn this:
// X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
// so that later, a: Y = 12+X could get reassociated with the -12 to eliminate
// the constants. We assume that instcombine will clean up the mess later if
// we introduce tons of unnecessary negation instructions.
//
if (BinaryOperator *I =
isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
// Push the negates through the add.
I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
if (I->getOpcode() == Instruction::Add) {
I->setHasNoUnsignedWrap(false);
I->setHasNoSignedWrap(false);
}
// We must move the add instruction here, because the neg instructions do
// not dominate the old add instruction in general. By moving it, we are
// assured that the neg instructions we just inserted dominate the
// instruction we are about to insert after them.
//
I->moveBefore(BI);
I->setName(I->getName()+".neg");
// Add the intermediate negates to the redo list as processing them later
// could expose more reassociating opportunities.
ToRedo.insert(I);
return I;
}
// Okay, we need to materialize a negated version of V with an instruction.
// Scan the use lists of V to see if we have one already.
for (User *U : V->users()) {
if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U))
continue;
// We found one! Now we have to make sure that the definition dominates
// this use. We do this by moving it to the entry block (if it is a
// non-instruction value) or right after the definition. These negates will
// be zapped by reassociate later, so we don't need much finesse here.
BinaryOperator *TheNeg = cast<BinaryOperator>(U);
// Verify that the negate is in this function, V might be a constant expr.
if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
continue;
BasicBlock::iterator InsertPt;
if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
InsertPt = II->getNormalDest()->begin();
} else {
InsertPt = ++InstInput->getIterator();
}
while (isa<PHINode>(InsertPt)) ++InsertPt;
} else {
InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
}
TheNeg->moveBefore(&*InsertPt);
if (TheNeg->getOpcode() == Instruction::Sub) {
TheNeg->setHasNoUnsignedWrap(false);
TheNeg->setHasNoSignedWrap(false);
} else {
TheNeg->andIRFlags(BI);
}
ToRedo.insert(TheNeg);
return TheNeg;
}
// Insert a 'neg' instruction that subtracts the value from zero to get the
// negation.
BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
ToRedo.insert(NewNeg);
return NewNeg;
}
/// Return true if we should break up this subtract of X-Y into (X + -Y).
static bool ShouldBreakUpSubtract(Instruction *Sub) {
// If this is a negation, we can't split it up!
if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub))
return false;
// Don't breakup X - undef.
if (isa<UndefValue>(Sub->getOperand(1)))
return false;
// Don't bother to break this up unless either the LHS is an associable add or
// subtract or if this is only used by one.
Value *V0 = Sub->getOperand(0);
if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
return true;
Value *V1 = Sub->getOperand(1);
if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
return true;
Value *VB = Sub->user_back();
if (Sub->hasOneUse() &&
(isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
isReassociableOp(VB, Instruction::Sub, Instruction::FSub)))
return true;
return false;
}
/// If we have (X-Y), and if either X is an add, or if this is only used by an
/// add, transform this into (X+(0-Y)) to promote better reassociation.
static BinaryOperator *
BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
// Convert a subtract into an add and a neg instruction. This allows sub
// instructions to be commuted with other add instructions.
//
// Calculate the negative value of Operand 1 of the sub instruction,
// and set it as the RHS of the add instruction we just made.
//
Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);
BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
New->takeName(Sub);
// Everyone now refers to the add instruction.
Sub->replaceAllUsesWith(New);
New->setDebugLoc(Sub->getDebugLoc());
DEBUG(dbgs() << "Negated: " << *New << '\n');
return New;
}
/// If this is a shift of a reassociable multiply or is used by one, change
/// this into a multiply by a constant to assist with further reassociation.
static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
BinaryOperator *Mul =
BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
Mul->takeName(Shl);
// Everyone now refers to the mul instruction.
Shl->replaceAllUsesWith(Mul);
Mul->setDebugLoc(Shl->getDebugLoc());
// We can safely preserve the nuw flag in all cases. It's also safe to turn a
// nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special
// handling.
bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
if (NSW && NUW)
Mul->setHasNoSignedWrap(true);
Mul->setHasNoUnsignedWrap(NUW);
return Mul;
}
/// Scan backwards and forwards among values with the same rank as element i
/// to see if X exists. If X does not exist, return i. This is useful when
/// scanning for 'x' when we see '-x' because they both get the same rank.
static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
Value *X) {
unsigned XRank = Ops[i].Rank;
unsigned e = Ops.size();
for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
if (Ops[j].Op == X)
return j;
if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
if (Instruction *I2 = dyn_cast<Instruction>(X))
if (I1->isIdenticalTo(I2))
return j;
}
// Scan backwards.
for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) {
if (Ops[j].Op == X)
return j;
if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
if (Instruction *I2 = dyn_cast<Instruction>(X))
if (I1->isIdenticalTo(I2))
return j;
}
return i;
}
/// Emit a tree of add instructions, summing Ops together
/// and returning the result. Insert the tree before I.
static Value *EmitAddTreeOfValues(Instruction *I,
SmallVectorImpl<WeakVH> &Ops){
if (Ops.size() == 1) return Ops.back();
Value *V1 = Ops.back();
Ops.pop_back();
Value *V2 = EmitAddTreeOfValues(I, Ops);
return CreateAdd(V2, V1, "tmp", I, I);
}
/// If V is an expression tree that is a multiplication sequence,
/// and if this sequence contains a multiply by Factor,
/// remove Factor from the tree and return the new tree.
Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
if (!BO)
return nullptr;
SmallVector<RepeatedValue, 8> Tree;
MadeChange |= LinearizeExprTree(BO, Tree);
SmallVector<ValueEntry, 8> Factors;
Factors.reserve(Tree.size());
for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
RepeatedValue E = Tree[i];
Factors.append(E.second.getZExtValue(),
ValueEntry(getRank(E.first), E.first));
}
bool FoundFactor = false;
bool NeedsNegate = false;
for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
if (Factors[i].Op == Factor) {
FoundFactor = true;
Factors.erase(Factors.begin()+i);
break;
}
// If this is a negative version of this factor, remove it.
if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
if (FC1->getValue() == -FC2->getValue()) {
FoundFactor = NeedsNegate = true;
Factors.erase(Factors.begin()+i);
break;
}
} else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
const APFloat &F1 = FC1->getValueAPF();
APFloat F2(FC2->getValueAPF());
F2.changeSign();
if (F1.compare(F2) == APFloat::cmpEqual) {
FoundFactor = NeedsNegate = true;
Factors.erase(Factors.begin() + i);
break;
}
}
}
}
if (!FoundFactor) {
// Make sure to restore the operands to the expression tree.
RewriteExprTree(BO, Factors);
return nullptr;
}
BasicBlock::iterator InsertPt = ++BO->getIterator();
// If this was just a single multiply, remove the multiply and return the only
// remaining operand.
if (Factors.size() == 1) {
RedoInsts.insert(BO);
V = Factors[0].Op;
} else {
RewriteExprTree(BO, Factors);
V = BO;
}
if (NeedsNegate)
V = CreateNeg(V, "neg", &*InsertPt, BO);
return V;
}
/// If V is a single-use multiply, recursively add its operands as factors,
/// otherwise add V to the list of factors.
///
/// Ops is the top-level list of add operands we're trying to factor.
static void FindSingleUseMultiplyFactors(Value *V,
SmallVectorImpl<Value*> &Factors,
const SmallVectorImpl<ValueEntry> &Ops) {
BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
if (!BO) {
Factors.push_back(V);
return;
}
// Otherwise, add the LHS and RHS to the list of factors.
FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops);
FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops);
}
/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
/// This optimizes based on identities. If it can be reduced to a single Value,
/// it is returned, otherwise the Ops list is mutated as necessary.
static Value *OptimizeAndOrXor(unsigned Opcode,
SmallVectorImpl<ValueEntry> &Ops) {
// Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
// If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
// First, check for X and ~X in the operand list.
assert(i < Ops.size());
if (BinaryOperator::isNot(Ops[i].Op)) { // Cannot occur for ^.
Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
unsigned FoundX = FindInOperandList(Ops, i, X);
if (FoundX != i) {
if (Opcode == Instruction::And) // ...&X&~X = 0
return Constant::getNullValue(X->getType());
if (Opcode == Instruction::Or) // ...|X|~X = -1
return Constant::getAllOnesValue(X->getType());
}
}
// Next, check for duplicate pairs of values, which we assume are next to
// each other, due to our sorting criteria.
assert(i < Ops.size());
if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
if (Opcode == Instruction::And || Opcode == Instruction::Or) {
// Drop duplicate values for And and Or.
Ops.erase(Ops.begin()+i);
--i; --e;
++NumAnnihil;
continue;
}
// Drop pairs of values for Xor.
assert(Opcode == Instruction::Xor);
if (e == 2)
return Constant::getNullValue(Ops[0].Op->getType());
// Y ^ X^X -> Y
Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
i -= 1; e -= 2;
++NumAnnihil;
}
}
return nullptr;
}
/// Helper function of CombineXorOpnd(). It creates a bitwise-and
/// instruction with the given two operands, and return the resulting
/// instruction. There are two special cases: 1) if the constant operand is 0,
/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
/// be returned.
static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
const APInt &ConstOpnd) {
if (ConstOpnd != 0) {
if (!ConstOpnd.isAllOnesValue()) {
LLVMContext &Ctx = Opnd->getType()->getContext();
Instruction *I;
I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd),
"and.ra", InsertBefore);
I->setDebugLoc(InsertBefore->getDebugLoc());
return I;
}
return Opnd;
}
return nullptr;
}
// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
// into "R ^ C", where C would be 0, and R is a symbolic value.
//
// If it was successful, true is returned, and the "R" and "C" is returned
// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
// and both "Res" and "ConstOpnd" remain unchanged.
//
bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
APInt &ConstOpnd, Value *&Res) {
// Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
// = ((x | c1) ^ c1) ^ (c1 ^ c2)
// = (x & ~c1) ^ (c1 ^ c2)
// It is useful only when c1 == c2.
if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) {
if (!Opnd1->getValue()->hasOneUse())
return false;
const APInt &C1 = Opnd1->getConstPart();
if (C1 != ConstOpnd)
return false;
Value *X = Opnd1->getSymbolicPart();
Res = createAndInstr(I, X, ~C1);
// ConstOpnd was C2, now C1 ^ C2.
ConstOpnd ^= C1;
if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
RedoInsts.insert(T);
return true;
}
return false;
}
// Helper function of OptimizeXor(). It tries to simplify
// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
// symbolic value.
//
// If it was successful, true is returned, and the "R" and "C" is returned
// via "Res" and "ConstOpnd", respectively (If the entire expression is
// evaluated to a constant, the Res is set to NULL); otherwise, false is
// returned, and both "Res" and "ConstOpnd" remain unchanged.
bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
XorOpnd *Opnd2, APInt &ConstOpnd,
Value *&Res) {
Value *X = Opnd1->getSymbolicPart();
if (X != Opnd2->getSymbolicPart())
return false;
// This many instruction become dead.(At least "Opnd1 ^ Opnd2" will die.)
int DeadInstNum = 1;
if (Opnd1->getValue()->hasOneUse())
DeadInstNum++;
if (Opnd2->getValue()->hasOneUse())
DeadInstNum++;
// Xor-Rule 2:
// (x | c1) ^ (x & c2)
// = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
// = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1
// = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3
//
if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
if (Opnd2->isOrExpr())
std::swap(Opnd1, Opnd2);
const APInt &C1 = Opnd1->getConstPart();
const APInt &C2 = Opnd2->getConstPart();
APInt C3((~C1) ^ C2);
// Do not increase code size!
if (C3 != 0 && !C3.isAllOnesValue()) {
int NewInstNum = ConstOpnd != 0 ? 1 : 2;
if (NewInstNum > DeadInstNum)
return false;
}
Res = createAndInstr(I, X, C3);
ConstOpnd ^= C1;
} else if (Opnd1->isOrExpr()) {
// Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
//
const APInt &C1 = Opnd1->getConstPart();
const APInt &C2 = Opnd2->getConstPart();
APInt C3 = C1 ^ C2;
// Do not increase code size
if (C3 != 0 && !C3.isAllOnesValue()) {
int NewInstNum = ConstOpnd != 0 ? 1 : 2;
if (NewInstNum > DeadInstNum)
return false;
}
Res = createAndInstr(I, X, C3);
ConstOpnd ^= C3;
} else {
// Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
//
const APInt &C1 = Opnd1->getConstPart();
const APInt &C2 = Opnd2->getConstPart();
APInt C3 = C1 ^ C2;
Res = createAndInstr(I, X, C3);
}
// Put the original operands in the Redo list; hope they will be deleted
// as dead code.
if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
RedoInsts.insert(T);
if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
RedoInsts.insert(T);
return true;
}
/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
/// to a single Value, it is returned, otherwise the Ops list is mutated as
/// necessary.
Value *ReassociatePass::OptimizeXor(Instruction *I,
SmallVectorImpl<ValueEntry> &Ops) {
if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
return V;
if (Ops.size() == 1)
return nullptr;
SmallVector<XorOpnd, 8> Opnds;
SmallVector<XorOpnd*, 8> OpndPtrs;
Type *Ty = Ops[0].Op->getType();
APInt ConstOpnd(Ty->getIntegerBitWidth(), 0);
// Step 1: Convert ValueEntry to XorOpnd
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
Value *V = Ops[i].Op;
if (!isa<ConstantInt>(V)) {
XorOpnd O(V);
O.setSymbolicRank(getRank(O.getSymbolicPart()));
Opnds.push_back(O);
} else
ConstOpnd ^= cast<ConstantInt>(V)->getValue();
}
// NOTE: From this point on, do *NOT* add/delete element to/from "Opnds".
// It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate
// the "OpndPtrs" as well. For the similar reason, do not fuse this loop
// with the previous loop --- the iterator of the "Opnds" may be invalidated
// when new elements are added to the vector.
for (unsigned i = 0, e = Opnds.size(); i != e; ++i)
OpndPtrs.push_back(&Opnds[i]);
// Step 2: Sort the Xor-Operands in a way such that the operands containing
// the same symbolic value cluster together. For instance, the input operand
// sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
// ("x | 123", "x & 789", "y & 456").
//
// The purpose is twofold:
// 1) Cluster together the operands sharing the same symbolic-value.
// 2) Operand having smaller symbolic-value-rank is permuted earlier, which
// could potentially shorten crital path, and expose more loop-invariants.
// Note that values' rank are basically defined in RPO order (FIXME).
// So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier
// than Y which is defined earlier than Z. Permute "x | 1", "Y & 2",
// "z" in the order of X-Y-Z is better than any other orders.
std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(),
[](XorOpnd *LHS, XorOpnd *RHS) {
return LHS->getSymbolicRank() < RHS->getSymbolicRank();
});
// Step 3: Combine adjacent operands
XorOpnd *PrevOpnd = nullptr;
bool Changed = false;
for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
XorOpnd *CurrOpnd = OpndPtrs[i];
// The combined value
Value *CV;
// Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
Changed = true;
if (CV)
*CurrOpnd = XorOpnd(CV);
else {
CurrOpnd->Invalidate();
continue;
}
}
if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
PrevOpnd = CurrOpnd;
continue;
}
// step 3.2: When previous and current operands share the same symbolic
// value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd"
//
if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
// Remove previous operand
PrevOpnd->Invalidate();
if (CV) {
*CurrOpnd = XorOpnd(CV);
PrevOpnd = CurrOpnd;
} else {
CurrOpnd->Invalidate();
PrevOpnd = nullptr;
}
Changed = true;
}
}
// Step 4: Reassemble the Ops
if (Changed) {
Ops.clear();
for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
XorOpnd &O = Opnds[i];
if (O.isInvalid())
continue;
ValueEntry VE(getRank(O.getValue()), O.getValue());
Ops.push_back(VE);
}
if (ConstOpnd != 0) {
Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd);
ValueEntry VE(getRank(C), C);
Ops.push_back(VE);
}
int Sz = Ops.size();
if (Sz == 1)
return Ops.back().Op;
else if (Sz == 0) {
assert(ConstOpnd == 0);
return ConstantInt::get(Ty->getContext(), ConstOpnd);
}
}
return nullptr;
}
/// Optimize a series of operands to an 'add' instruction. This
/// optimizes based on identities. If it can be reduced to a single Value, it
/// is returned, otherwise the Ops list is mutated as necessary.
Value *ReassociatePass::OptimizeAdd(Instruction *I,
SmallVectorImpl<ValueEntry> &Ops) {
// Scan the operand lists looking for X and -X pairs. If we find any, we
// can simplify expressions like X+-X == 0 and X+~X ==-1. While we're at it,
// scan for any
// duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
Value *TheOp = Ops[i].Op;
// Check to see if we've seen this operand before. If so, we factor all
// instances of the operand together. Due to our sorting criteria, we know
// that these need to be next to each other in the vector.
if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
// Rescan the list, remove all instances of this operand from the expr.
unsigned NumFound = 0;
do {
Ops.erase(Ops.begin()+i);
++NumFound;
} while (i != Ops.size() && Ops[i].Op == TheOp);
DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
++NumFactor;
// Insert a new multiply.
Type *Ty = TheOp->getType();
Constant *C = Ty->isIntOrIntVectorTy() ?
ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
// Now that we have inserted a multiply, optimize it. This allows us to
// handle cases that require multiple factoring steps, such as this:
// (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
RedoInsts.insert(Mul);
// If every add operand was a duplicate, return the multiply.
if (Ops.empty())
return Mul;
// Otherwise, we had some input that didn't have the dupe, such as
// "A + A + B" -> "A*2 + B". Add the new multiply to the list of
// things being added by this operation.
Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
--i;
e = Ops.size();
continue;
}
// Check for X and -X or X and ~X in the operand list.
if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) &&
!BinaryOperator::isNot(TheOp))
continue;
Value *X = nullptr;
if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))
X = BinaryOperator::getNegArgument(TheOp);
else if (BinaryOperator::isNot(TheOp))
X = BinaryOperator::getNotArgument(TheOp);
unsigned FoundX = FindInOperandList(Ops, i, X);
if (FoundX == i)
continue;
// Remove X and -X from the operand list.
if (Ops.size() == 2 &&
(BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)))
return Constant::getNullValue(X->getType());
// Remove X and ~X from the operand list.
if (Ops.size() == 2 && BinaryOperator::isNot(TheOp))
return Constant::getAllOnesValue(X->getType());
Ops.erase(Ops.begin()+i);
if (i < FoundX)
--FoundX;
else
--i; // Need to back up an extra one.
Ops.erase(Ops.begin()+FoundX);
++NumAnnihil;
--i; // Revisit element.
e -= 2; // Removed two elements.
// if X and ~X we append -1 to the operand list.
if (BinaryOperator::isNot(TheOp)) {
Value *V = Constant::getAllOnesValue(X->getType());
Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
e += 1;
}
}
// Scan the operand list, checking to see if there are any common factors
// between operands. Consider something like A*A+A*B*C+D. We would like to
// reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
// To efficiently find this, we count the number of times a factor occurs
// for any ADD operands that are MULs.
DenseMap<Value*, unsigned> FactorOccurrences;
// Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
// where they are actually the same multiply.
unsigned MaxOcc = 0;
Value *MaxOccVal = nullptr;
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
BinaryOperator *BOp =
isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
if (!BOp)
continue;
// Compute all of the factors of this added value.
SmallVector<Value*, 8> Factors;
FindSingleUseMultiplyFactors(BOp, Factors, Ops);
assert(Factors.size() > 1 && "Bad linearize!");
// Add one to FactorOccurrences for each unique factor in this op.
SmallPtrSet<Value*, 8> Duplicates;
for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
Value *Factor = Factors[i];
if (!Duplicates.insert(Factor).second)
continue;
unsigned Occ = ++FactorOccurrences[Factor];
if (Occ > MaxOcc) {
MaxOcc = Occ;
MaxOccVal = Factor;
}
// If Factor is a negative constant, add the negated value as a factor
// because we can percolate the negate out. Watch for minint, which
// cannot be positivified.
if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
if (CI->isNegative() && !CI->isMinValue(true)) {
Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
- assert(!Duplicates.count(Factor) &&
- "Shouldn't have two constant factors, missed a canonicalize");
+ if (!Duplicates.insert(Factor).second)
+ continue;
unsigned Occ = ++FactorOccurrences[Factor];
if (Occ > MaxOcc) {
MaxOcc = Occ;
MaxOccVal = Factor;
}
}
} else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
if (CF->isNegative()) {
APFloat F(CF->getValueAPF());
F.changeSign();
Factor = ConstantFP::get(CF->getContext(), F);
- assert(!Duplicates.count(Factor) &&
- "Shouldn't have two constant factors, missed a canonicalize");
+ if (!Duplicates.insert(Factor).second)
+ continue;
unsigned Occ = ++FactorOccurrences[Factor];
if (Occ > MaxOcc) {
MaxOcc = Occ;
MaxOccVal = Factor;
}
}
}
}
}
// If any factor occurred more than one time, we can pull it out.
if (MaxOcc > 1) {
DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
++NumFactor;
// Create a new instruction that uses the MaxOccVal twice. If we don't do
// this, we could otherwise run into situations where removing a factor
// from an expression will drop a use of maxocc, and this can cause
// RemoveFactorFromExpression on successive values to behave differently.
Instruction *DummyInst =
I->getType()->isIntOrIntVectorTy()
? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
: BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
SmallVector<WeakVH, 4> NewMulOps;
for (unsigned i = 0; i != Ops.size(); ++i) {
// Only try to remove factors from expressions we're allowed to.
BinaryOperator *BOp =
isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
if (!BOp)
continue;
if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
// The factorized operand may occur several times. Convert them all in
// one fell swoop.
for (unsigned j = Ops.size(); j != i;) {
--j;
if (Ops[j].Op == Ops[i].Op) {
NewMulOps.push_back(V);
Ops.erase(Ops.begin()+j);
}
}
--i;
}
}
// No need for extra uses anymore.
delete DummyInst;
unsigned NumAddedValues = NewMulOps.size();
Value *V = EmitAddTreeOfValues(I, NewMulOps);
// Now that we have inserted the add tree, optimize it. This allows us to
// handle cases that require multiple factoring steps, such as this:
// A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
(void)NumAddedValues;
if (Instruction *VI = dyn_cast<Instruction>(V))
RedoInsts.insert(VI);
// Create the multiply.
Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I);
// Rerun associate on the multiply in case the inner expression turned into
// a multiply. We want to make sure that we keep things in canonical form.
RedoInsts.insert(V2);
// If every add operand included the factor (e.g. "A*B + A*C"), then the
// entire result expression is just the multiply "A*(B+C)".
if (Ops.empty())
return V2;
// Otherwise, we had some input that didn't have the factor, such as
// "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
// things being added by this operation.
Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
}
return nullptr;
}
/// \brief Build up a vector of value/power pairs factoring a product.
///
/// Given a series of multiplication operands, build a vector of factors and
/// the powers each is raised to when forming the final product. Sort them in
/// the order of descending power.
///
/// (x*x) -> [(x, 2)]
/// ((x*x)*x) -> [(x, 3)]
/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
///
/// \returns Whether any factors have a power greater than one.
bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
SmallVectorImpl<Factor> &Factors) {
// FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
// Compute the sum of powers of simplifiable factors.
unsigned FactorPowerSum = 0;
for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) {
Value *Op = Ops[Idx-1].Op;
// Count the number of occurrences of this value.
unsigned Count = 1;
for (; Idx < Size && Ops[Idx].Op == Op; ++Idx)
++Count;
// Track for simplification all factors which occur 2 or more times.
if (Count > 1)
FactorPowerSum += Count;
}
// We can only simplify factors if the sum of the powers of our simplifiable
// factors is 4 or higher. When that is the case, we will *always* have
// a simplification. This is an important invariant to prevent cyclicly
// trying to simplify already minimal formations.
if (FactorPowerSum < 4)
return false;
// Now gather the simplifiable factors, removing them from Ops.
FactorPowerSum = 0;
for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) {
Value *Op = Ops[Idx-1].Op;
// Count the number of occurrences of this value.
unsigned Count = 1;
for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
++Count;
if (Count == 1)
continue;
// Move an even number of occurrences to Factors.
Count &= ~1U;
Idx -= Count;
FactorPowerSum += Count;
Factors.push_back(Factor(Op, Count));
Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
}
// None of the adjustments above should have reduced the sum of factor powers
// below our mininum of '4'.
assert(FactorPowerSum >= 4);
std::stable_sort(Factors.begin(), Factors.end(),
[](const Factor &LHS, const Factor &RHS) {
return LHS.Power > RHS.Power;
});
return true;
}
/// \brief Build a tree of multiplies, computing the product of Ops.
static Value *buildMultiplyTree(IRBuilder<> &Builder,
SmallVectorImpl<Value*> &Ops) {
if (Ops.size() == 1)
return Ops.back();
Value *LHS = Ops.pop_back_val();
do {
if (LHS->getType()->isIntOrIntVectorTy())
LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
else
LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
} while (!Ops.empty());
return LHS;
}
/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
///
/// Given a vector of values raised to various powers, where no two values are
/// equal and the powers are sorted in decreasing order, compute the minimal
/// DAG of multiplies to compute the final product, and return that product
/// value.
Value *
ReassociatePass::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
SmallVectorImpl<Factor> &Factors) {
assert(Factors[0].Power);
SmallVector<Value *, 4> OuterProduct;
for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
Idx < Size && Factors[Idx].Power > 0; ++Idx) {
if (Factors[Idx].Power != Factors[LastIdx].Power) {
LastIdx = Idx;
continue;
}
// We want to multiply across all the factors with the same power so that
// we can raise them to that power as a single entity. Build a mini tree
// for that.
SmallVector<Value *, 4> InnerProduct;
InnerProduct.push_back(Factors[LastIdx].Base);
do {
InnerProduct.push_back(Factors[Idx].Base);
++Idx;
} while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
// Reset the base value of the first factor to the new expression tree.
// We'll remove all the factors with the same power in a second pass.
Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
if (Instruction *MI = dyn_cast<Instruction>(M))
RedoInsts.insert(MI);
LastIdx = Idx;
}
// Unique factors with equal powers -- we've folded them into the first one's
// base.
Factors.erase(std::unique(Factors.begin(), Factors.end(),
[](const Factor &LHS, const Factor &RHS) {
return LHS.Power == RHS.Power;
}),
Factors.end());
// Iteratively collect the base of each factor with an add power into the
// outer product, and halve each power in preparation for squaring the
// expression.
for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
if (Factors[Idx].Power & 1)
OuterProduct.push_back(Factors[Idx].Base);
Factors[Idx].Power >>= 1;
}
if (Factors[0].Power) {
Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
OuterProduct.push_back(SquareRoot);
OuterProduct.push_back(SquareRoot);
}
if (OuterProduct.size() == 1)
return OuterProduct.front();
Value *V = buildMultiplyTree(Builder, OuterProduct);
return V;
}
Value *ReassociatePass::OptimizeMul(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops) {
// We can only optimize the multiplies when there is a chain of more than
// three, such that a balanced tree might require fewer total multiplies.
if (Ops.size() < 4)
return nullptr;
// Try to turn linear trees of multiplies without other uses of the
// intermediate stages into minimal multiply DAGs with perfect sub-expression
// re-use.
SmallVector<Factor, 4> Factors;
if (!collectMultiplyFactors(Ops, Factors))
return nullptr; // All distinct factors, so nothing left for us to do.
IRBuilder<> Builder(I);
// The reassociate transformation for FP operations is performed only
// if unsafe algebra is permitted by FastMathFlags. Propagate those flags
// to the newly generated operations.
if (auto FPI = dyn_cast<FPMathOperator>(I))
Builder.setFastMathFlags(FPI->getFastMathFlags());
Value *V = buildMinimalMultiplyDAG(Builder, Factors);
if (Ops.empty())
return V;
ValueEntry NewEntry = ValueEntry(getRank(V), V);
Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry);
return nullptr;
}
Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops) {
// Now that we have the linearized expression tree, try to optimize it.
// Start by folding any constants that we found.
Constant *Cst = nullptr;
unsigned Opcode = I->getOpcode();
while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
Constant *C = cast<Constant>(Ops.pop_back_val().Op);
Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
}
// If there was nothing but constants then we are done.
if (Ops.empty())
return Cst;
// Put the combined constant back at the end of the operand list, except if
// there is no point. For example, an add of 0 gets dropped here, while a
// multiplication by zero turns the whole expression into zero.
if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
return Cst;
Ops.push_back(ValueEntry(0, Cst));
}
if (Ops.size() == 1) return Ops[0].Op;
// Handle destructive annihilation due to identities between elements in the
// argument list here.
unsigned NumOps = Ops.size();
switch (Opcode) {
default: break;
case Instruction::And:
case Instruction::Or:
if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
return Result;
break;
case Instruction::Xor:
if (Value *Result = OptimizeXor(I, Ops))
return Result;
break;
case Instruction::Add:
case Instruction::FAdd:
if (Value *Result = OptimizeAdd(I, Ops))
return Result;
break;
case Instruction::Mul:
case Instruction::FMul:
if (Value *Result = OptimizeMul(I, Ops))
return Result;
break;
}
if (Ops.size() != NumOps)
return OptimizeExpression(I, Ops);
return nullptr;
}
// Remove dead instructions and if any operands are trivially dead add them to
// Insts so they will be removed as well.
void ReassociatePass::RecursivelyEraseDeadInsts(
Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
ValueRankMap.erase(I);
Insts.remove(I);
RedoInsts.remove(I);
I->eraseFromParent();
for (auto Op : Ops)
if (Instruction *OpInst = dyn_cast<Instruction>(Op))
if (OpInst->use_empty())
Insts.insert(OpInst);
}
/// Zap the given instruction, adding interesting operands to the work list.
void ReassociatePass::EraseInst(Instruction *I) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
// Erase the dead instruction.
ValueRankMap.erase(I);
RedoInsts.remove(I);
I->eraseFromParent();
// Optimize its operands.
SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) {
// If this is a node in an expression tree, climb to the expression root
// and add that since that's where optimization actually happens.
unsigned Opcode = Op->getOpcode();
while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
Visited.insert(Op).second)
Op = Op->user_back();
RedoInsts.insert(Op);
}
}
// Canonicalize expressions of the following form:
// x + (-Constant * y) -> x - (Constant * y)
// x - (-Constant * y) -> x + (Constant * y)
Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
if (!I->hasOneUse() || I->getType()->isVectorTy())
return nullptr;
// Must be a fmul or fdiv instruction.
unsigned Opcode = I->getOpcode();
if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv)
return nullptr;
auto *C0 = dyn_cast<ConstantFP>(I->getOperand(0));
auto *C1 = dyn_cast<ConstantFP>(I->getOperand(1));
// Both operands are constant, let it get constant folded away.
if (C0 && C1)
return nullptr;
ConstantFP *CF = C0 ? C0 : C1;
// Must have one constant operand.
if (!CF)
return nullptr;
// Must be a negative ConstantFP.
if (!CF->isNegative())
return nullptr;
// User must be a binary operator with one or more uses.
Instruction *User = I->user_back();
if (!isa<BinaryOperator>(User) || !User->hasNUsesOrMore(1))
return nullptr;
unsigned UserOpcode = User->getOpcode();
if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub)
return nullptr;
// Subtraction is not commutative. Explicitly, the following transform is
// not valid: (-Constant * y) - x -> x + (Constant * y)
if (!User->isCommutative() && User->getOperand(1) != I)
return nullptr;
// Change the sign of the constant.
APFloat Val = CF->getValueAPF();
Val.changeSign();
I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val));
// Canonicalize I to RHS to simplify the next bit of logic. E.g.,
// ((-Const*y) + x) -> (x + (-Const*y)).
if (User->getOperand(0) == I && User->isCommutative())
cast<BinaryOperator>(User)->swapOperands();
Value *Op0 = User->getOperand(0);
Value *Op1 = User->getOperand(1);
BinaryOperator *NI;
switch (UserOpcode) {
default:
llvm_unreachable("Unexpected Opcode!");
case Instruction::FAdd:
NI = BinaryOperator::CreateFSub(Op0, Op1);
NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
break;
case Instruction::FSub:
NI = BinaryOperator::CreateFAdd(Op0, Op1);
NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
break;
}
NI->insertBefore(User);
NI->setName(User->getName());
User->replaceAllUsesWith(NI);
NI->setDebugLoc(I->getDebugLoc());
RedoInsts.insert(I);
MadeChange = true;
return NI;
}
/// Inspect and optimize the given instruction. Note that erasing
/// instructions is not allowed.
void ReassociatePass::OptimizeInst(Instruction *I) {
// Only consider operations that we understand.
if (!isa<BinaryOperator>(I))
return;
if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
// If an operand of this shift is a reassociable multiply, or if the shift
// is used by a reassociable multiply or add, turn into a multiply.
if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
(I->hasOneUse() &&
(isReassociableOp(I->user_back(), Instruction::Mul) ||
isReassociableOp(I->user_back(), Instruction::Add)))) {
Instruction *NI = ConvertShiftToMul(I);
RedoInsts.insert(I);
MadeChange = true;
I = NI;
}
// Canonicalize negative constants out of expressions.
if (Instruction *Res = canonicalizeNegConstExpr(I))
I = Res;
// Commute binary operators, to canonicalize the order of their operands.
// This can potentially expose more CSE opportunities, and makes writing other
// transformations simpler.
if (I->isCommutative())
canonicalizeOperands(I);
// TODO: We should optimize vector Xor instructions, but they are
// currently unsupported.
if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor)
return;
// Don't optimize floating point instructions that don't have unsafe algebra.
if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())
return;
// Do not reassociate boolean (i1) expressions. We want to preserve the
// original order of evaluation for short-circuited comparisons that
// SimplifyCFG has folded to AND/OR expressions. If the expression
// is not further optimized, it is likely to be transformed back to a
// short-circuited form for code gen, and the source order may have been
// optimized for the most likely conditions.
if (I->getType()->isIntegerTy(1))
return;
// If this is a subtract instruction which is not already in negate form,
// see if we can convert it to X+-Y.
if (I->getOpcode() == Instruction::Sub) {
if (ShouldBreakUpSubtract(I)) {
Instruction *NI = BreakUpSubtract(I, RedoInsts);
RedoInsts.insert(I);
MadeChange = true;
I = NI;
} else if (BinaryOperator::isNeg(I)) {
// Otherwise, this is a negation. See if the operand is a multiply tree
// and if this is not an inner node of a multiply tree.
if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
(!I->hasOneUse() ||
!isReassociableOp(I->user_back(), Instruction::Mul))) {
Instruction *NI = LowerNegateToMultiply(I);
// If the negate was simplified, revisit the users to see if we can
// reassociate further.
for (User *U : NI->users()) {
if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
RedoInsts.insert(Tmp);
}
RedoInsts.insert(I);
MadeChange = true;
I = NI;
}
}
} else if (I->getOpcode() == Instruction::FSub) {
if (ShouldBreakUpSubtract(I)) {
Instruction *NI = BreakUpSubtract(I, RedoInsts);
RedoInsts.insert(I);
MadeChange = true;
I = NI;
} else if (BinaryOperator::isFNeg(I)) {
// Otherwise, this is a negation. See if the operand is a multiply tree
// and if this is not an inner node of a multiply tree.
if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
(!I->hasOneUse() ||
!isReassociableOp(I->user_back(), Instruction::FMul))) {
// If the negate was simplified, revisit the users to see if we can
// reassociate further.
Instruction *NI = LowerNegateToMultiply(I);
for (User *U : NI->users()) {
if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
RedoInsts.insert(Tmp);
}
RedoInsts.insert(I);
MadeChange = true;
I = NI;
}
}
}
// If this instruction is an associative binary operator, process it.
if (!I->isAssociative()) return;
BinaryOperator *BO = cast<BinaryOperator>(I);
// If this is an interior node of a reassociable tree, ignore it until we
// get to the root of the tree, to avoid N^2 analysis.
unsigned Opcode = BO->getOpcode();
if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
// During the initial run we will get to the root of the tree.
// But if we get here while we are redoing instructions, there is no
// guarantee that the root will be visited. So Redo later
if (BO->user_back() != BO &&
BO->getParent() == BO->user_back()->getParent())
RedoInsts.insert(BO->user_back());
return;
}
// If this is an add tree that is used by a sub instruction, ignore it
// until we process the subtract.
if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
return;
if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
return;
ReassociateExpression(BO);
}
void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
// First, walk the expression tree, linearizing the tree, collecting the
// operand information.
SmallVector<RepeatedValue, 8> Tree;
MadeChange |= LinearizeExprTree(I, Tree);
SmallVector<ValueEntry, 8> Ops;
Ops.reserve(Tree.size());
for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
RepeatedValue E = Tree[i];
Ops.append(E.second.getZExtValue(),
ValueEntry(getRank(E.first), E.first));
}
DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
// Now that we have linearized the tree to a list and have gathered all of
// the operands and their ranks, sort the operands by their rank. Use a
// stable_sort so that values with equal ranks will have their relative
// positions maintained (and so the compiler is deterministic). Note that
// this sorts so that the highest ranking values end up at the beginning of
// the vector.
std::stable_sort(Ops.begin(), Ops.end());
// Now that we have the expression tree in a convenient
// sorted form, optimize it globally if possible.
if (Value *V = OptimizeExpression(I, Ops)) {
if (V == I)
// Self-referential expression in unreachable code.
return;
// This expression tree simplified to something that isn't a tree,
// eliminate it.
DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
I->replaceAllUsesWith(V);
if (Instruction *VI = dyn_cast<Instruction>(V))
VI->setDebugLoc(I->getDebugLoc());
RedoInsts.insert(I);
++NumAnnihil;
return;
}
// We want to sink immediates as deeply as possible except in the case where
// this is a multiply tree used only by an add, and the immediate is a -1.
// In this case we reassociate to put the negation on the outside so that we
// can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
if (I->hasOneUse()) {
if (I->getOpcode() == Instruction::Mul &&
cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
isa<ConstantInt>(Ops.back().Op) &&
cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
ValueEntry Tmp = Ops.pop_back_val();
Ops.insert(Ops.begin(), Tmp);
} else if (I->getOpcode() == Instruction::FMul &&
cast<Instruction>(I->user_back())->getOpcode() ==
Instruction::FAdd &&
isa<ConstantFP>(Ops.back().Op) &&
cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) {
ValueEntry Tmp = Ops.pop_back_val();
Ops.insert(Ops.begin(), Tmp);
}
}
DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
if (Ops.size() == 1) {
if (Ops[0].Op == I)
// Self-referential expression in unreachable code.
return;
// This expression tree simplified to something that isn't a tree,
// eliminate it.
I->replaceAllUsesWith(Ops[0].Op);
if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
OI->setDebugLoc(I->getDebugLoc());
RedoInsts.insert(I);
return;
}
// Now that we ordered and optimized the expressions, splat them back into
// the expression tree, removing any unneeded nodes.
RewriteExprTree(I, Ops);
}
PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
// Get the functions basic blocks in Reverse Post Order. This order is used by
// BuildRankMap to pre calculate ranks correctly. It also excludes dead basic
// blocks (it has been seen that the analysis in this pass could hang when
// analysing dead basic blocks).
ReversePostOrderTraversal<Function *> RPOT(&F);
// Calculate the rank map for F.
BuildRankMap(F, RPOT);
MadeChange = false;
// Traverse the same blocks that was analysed by BuildRankMap.
for (BasicBlock *BI : RPOT) {
assert(RankMap.count(&*BI) && "BB should be ranked.");
// Optimize every instruction in the basic block.
for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;)
if (isInstructionTriviallyDead(&*II)) {
EraseInst(&*II++);
} else {
OptimizeInst(&*II);
assert(II->getParent() == &*BI && "Moved to a different block!");
++II;
}
// Make a copy of all the instructions to be redone so we can remove dead
// instructions.
SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
// Iterate over all instructions to be reevaluated and remove trivially dead
// instructions. If any operand of the trivially dead instruction becomes
// dead mark it for deletion as well. Continue this process until all
// trivially dead instructions have been removed.
while (!ToRedo.empty()) {
Instruction *I = ToRedo.pop_back_val();
if (isInstructionTriviallyDead(I)) {
RecursivelyEraseDeadInsts(I, ToRedo);
MadeChange = true;
}
}
// Now that we have removed dead instructions, we can reoptimize the
// remaining instructions.
while (!RedoInsts.empty()) {
Instruction *I = RedoInsts.pop_back_val();
if (isInstructionTriviallyDead(I))
EraseInst(I);
else
OptimizeInst(I);
}
}
// We are done with the rank map.
RankMap.clear();
ValueRankMap.clear();
if (MadeChange) {
// FIXME: This should also 'preserve the CFG'.
auto PA = PreservedAnalyses();
PA.preserve<GlobalsAA>();
return PA;
}
return PreservedAnalyses::all();
}
namespace {
class ReassociateLegacyPass : public FunctionPass {
ReassociatePass Impl;
public:
static char ID; // Pass identification, replacement for typeid
ReassociateLegacyPass() : FunctionPass(ID) {
initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
FunctionAnalysisManager DummyFAM;
auto PA = Impl.run(F, DummyFAM);
return !PA.areAllPreserved();
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
}
char ReassociateLegacyPass::ID = 0;
INITIALIZE_PASS(ReassociateLegacyPass, "reassociate",
"Reassociate expressions", false, false)
// Public interface to the Reassociate pass
FunctionPass *llvm::createReassociatePass() {
return new ReassociateLegacyPass();
}
Index: projects/clang400-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 314269)
@@ -1,4945 +1,4854 @@
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>
using namespace llvm;
using namespace slpvectorizer;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize if you gain more than this "
"number "));
static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
cl::desc("Attempt to vectorize horizontal reductions"));
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
"slp-vectorize-hor-store", cl::init(false), cl::Hidden,
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
/// Limits the size of scheduling regions in a block.
/// It avoid long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
cl::desc("Limit the size of the SLP scheduling region per block"));
static cl::opt<int> MinVectorRegSizeOption(
"slp-min-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
static cl::opt<unsigned> RecursionMaxDepth(
"slp-recursion-max-depth", cl::init(12), cl::Hidden,
cl::desc("Limit the recursion depth when building a vectorizable tree"));
static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;
/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;
/// \brief Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
!Ty->isPPC_FP128Ty();
}
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
return false;
BasicBlock *BB = I0->getParent();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I)
return false;
if (BB != I->getParent())
return false;
}
return true;
}
/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
for (Value *i : VL)
if (!isa<Constant>(i))
return false;
return true;
}
/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
for (unsigned i = 1, e = VL.size(); i < e; ++i)
if (VL[i] != VL[0])
return false;
return true;
}
///\returns Opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
switch (Op) {
case Instruction::FAdd:
return Instruction::FSub;
case Instruction::FSub:
return Instruction::FAdd;
case Instruction::Add:
return Instruction::Sub;
case Instruction::Sub:
return Instruction::Add;
default:
return 0;
}
}
///\returns bool representing if Opcode \p Op can be part
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
return Op == Instruction::FAdd || Op == Instruction::FSub ||
Op == Instruction::Sub || Op == Instruction::Add;
}
/// \returns ShuffleVector instruction if instructions in \p VL have
/// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
static unsigned isAltInst(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
unsigned Opcode = I0->getOpcode();
unsigned AltOpcode = getAltOpcode(Opcode);
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
return 0;
}
return Instruction::ShuffleVector;
}
/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
return 0;
unsigned Opcode = I0->getOpcode();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I || Opcode != I->getOpcode()) {
if (canCombineAsAltInst(Opcode) && i == 1)
return isAltInst(VL);
return 0;
}
}
return Opcode;
}
/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
if (auto *VecOp = dyn_cast<Instruction>(I)) {
if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
// Intersection is initialized to the 0th scalar,
// so start counting from index '1'.
for (int i = 1, e = VL.size(); i < e; ++i) {
if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
Intersection->andIRFlags(Scalar);
}
VecOp->copyIRFlags(Intersection);
}
}
}
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
Type *Ty = VL[0]->getType();
for (int i = 1, e = VL.size(); i < e; i++)
if (VL[i]->getType() != Ty)
return false;
return true;
}
/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
assert(Opcode == Instruction::ExtractElement ||
Opcode == Instruction::ExtractValue);
if (Opcode == Instruction::ExtractElement) {
ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
return CI && CI->getZExtValue() == Idx;
} else {
ExtractValueInst *EI = cast<ExtractValueInst>(E);
return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
}
}
/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
TargetLibraryInfo *TLI) {
unsigned Opcode = UserInst->getOpcode();
switch (Opcode) {
case Instruction::Load: {
LoadInst *LI = cast<LoadInst>(UserInst);
return (LI->getPointerOperand() == Scalar);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(UserInst);
return (SI->getPointerOperand() == Scalar);
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
return (CI->getArgOperand(1) == Scalar);
}
}
default:
return false;
}
}
/// \returns the AA location that is being access by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return MemoryLocation::get(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return MemoryLocation::get(LI);
return MemoryLocation();
}
/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return LI->isSimple();
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return SI->isSimple();
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
return !MI->isVolatile();
return true;
}
namespace llvm {
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
typedef SmallVector<Value *, 8> ValueList;
typedef SmallVector<Instruction *, 16> InstrList;
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
DL(DL), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
// TODO: It would be better to limit the vectorization factor based on
// data type rather than just register size. For example, x86 AVX has
// 256-bit registers, but it does not support integer operations
// at that width (that requires AVX2).
if (MaxVectorRegSizeOption.getNumOccurrences())
MaxVecRegSize = MaxVectorRegSizeOption;
else
MaxVecRegSize = TTI->getRegisterBitWidth(true);
MinVecRegSize = MinVectorRegSizeOption;
}
/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
int getSpillCost();
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
int getTreeCost();
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
VectorizableTree.clear();
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
NumLoadsWantToKeepOrder = 0;
NumLoadsWantToChangeOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
}
MinBWs.clear();
}
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
/// \returns true if it is beneficial to reverse the vector order.
bool shouldReorder() const {
return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
}
/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of
/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.
unsigned getVectorElementSize(Value *V);
/// Compute the minimum type sizes required to represent the entries in a
/// vectorizable tree.
void computeMinimumValueSizes();
// \returns maximum vector register size as set by TTI or overridden by cl::opt.
unsigned getMaxVecRegSize() const {
return MaxVecRegSize;
}
// \returns minimum vector register size as set by cl::opt.
unsigned getMinVecRegSize() const {
return MinVecRegSize;
}
/// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable();
private:
struct TreeEntry;
/// \returns the cost of the vectorizable entry.
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
/// \returns True if the ExtractElement/ExtractValue instructions in VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
/// \returns the pointer to the vectorized value if \p VL is already
/// vectorized, or NULL. They may happen in cycles.
Value *alreadyVectorized(ArrayRef<Value *> VL) const;
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
int getGatherCost(Type *Ty);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL);
/// \brief Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(ArrayRef<Value *> VL);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
bool isFullyVectorizableTinyTree();
/// \reorder commutative operands in alt shuffle if they result in
/// vectorized code.
void reorderAltShuffleOperands(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
/// \reorder commutative operands to get better probability of
/// generating vectorized code.
void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
struct TreeEntry {
TreeEntry() : Scalars(), VectorizedValue(nullptr),
NeedToGather(0) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
assert(VL.size() == Scalars.size() && "Invalid size");
return std::equal(VL.begin(), VL.end(), Scalars.begin());
}
/// A vector of scalars.
ValueList Scalars;
/// The Scalars are vectorized into this value. It is initialized to Null.
Value *VectorizedValue;
/// Do we need to gather this sequence ?
bool NeedToGather;
};
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
VectorizableTree.emplace_back();
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
ScalarToTreeEntry[VL[i]] = idx;
}
} else {
MustGather.insert(VL.begin(), VL.end());
}
return Last;
}
/// -- Vectorization State --
/// Holds all of the tree entries.
std::vector<TreeEntry> VectorizableTree;
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, int> ScalarToTreeEntry;
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser (Value *S, llvm::User *U, int L) :
Scalar(S), User(U), Lane(L){}
// Which scalar in our function.
Value *Scalar;
// Which user that uses the scalar.
llvm::User *User;
// Which lane does the scalar belong to.
int Lane;
};
typedef SmallVector<ExternalUser, 16> UserList;
/// Checks if two instructions may access the same memory.
///
/// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
/// is invariant in the calling loop.
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
Instruction *Inst2) {
// First check if the result is already in the cache.
AliasCacheKey key = std::make_pair(Inst1, Inst2);
Optional<bool> &result = AliasCache[key];
if (result.hasValue()) {
return result.getValue();
}
MemoryLocation Loc2 = getLocation(Inst2, AA);
bool aliased = true;
if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
// Do the alias check.
aliased = AA->alias(Loc1, Loc2);
}
// Store the result in the cache.
result = aliased;
return aliased;
}
typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
/// Removes an instruction from its block and eventually deletes it.
/// It's like Instruction::eraseFromParent() except that the actual deletion
/// is delayed until BoUpSLP is destructed.
/// This is required to ensure that there are no incorrect collisions in the
/// AliasCache, which can happen if a new instruction is allocated at the
/// same address as a previously deleted instruction.
void eraseInstruction(Instruction *I) {
I->removeFromParent();
I->dropAllReferences();
DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
}
/// Temporary store for deleted instructions. Instructions will be deleted
/// eventually when the BoUpSLP is destructed.
SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User).
UserList ExternalUses;
/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
/// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an
/// instruction bundle (= a group of instructions which is combined into a
/// vector instruction).
struct ScheduleData {
// The initial value for the dependency counters. It means that the
// dependencies are not calculated yet.
enum { InvalidDeps = -1 };
ScheduleData()
: Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
void init(int BlockSchedulingRegionID) {
FirstInBundle = this;
NextInBundle = nullptr;
NextLoadStore = nullptr;
IsScheduled = false;
SchedulingRegionID = BlockSchedulingRegionID;
UnscheduledDepsInBundle = UnscheduledDeps;
clearDependencies();
}
/// Returns true if the dependency information has been calculated.
bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
/// Returns true for single instructions and for bundle representatives
/// (= the head of a bundle).
bool isSchedulingEntity() const { return FirstInBundle == this; }
/// Returns true if it represents an instruction bundle and not only a
/// single instruction.
bool isPartOfBundle() const {
return NextInBundle != nullptr || FirstInBundle != this;
}
/// Returns true if it is ready for scheduling, i.e. it has no more
/// unscheduled depending instructions/bundles.
bool isReady() const {
assert(isSchedulingEntity() &&
"can't consider non-scheduling entity for ready list");
return UnscheduledDepsInBundle == 0 && !IsScheduled;
}
/// Modifies the number of unscheduled dependencies, also updating it for
/// the whole bundle.
int incrementUnscheduledDeps(int Incr) {
UnscheduledDeps += Incr;
return FirstInBundle->UnscheduledDepsInBundle += Incr;
}
/// Sets the number of unscheduled dependencies to the number of
/// dependencies.
void resetUnscheduledDeps() {
incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
}
/// Clears all dependency information.
void clearDependencies() {
Dependencies = InvalidDeps;
resetUnscheduledDeps();
MemoryDependencies.clear();
}
void dump(raw_ostream &os) const {
if (!isSchedulingEntity()) {
os << "/ " << *Inst;
} else if (NextInBundle) {
os << '[' << *Inst;
ScheduleData *SD = NextInBundle;
while (SD) {
os << ';' << *SD->Inst;
SD = SD->NextInBundle;
}
os << ']';
} else {
os << *Inst;
}
}
Instruction *Inst;
/// Points to the head in an instruction bundle (and always to this for
/// single instructions).
ScheduleData *FirstInBundle;
/// Single linked list of all instructions in a bundle. Null if it is a
/// single instruction.
ScheduleData *NextInBundle;
/// Single linked list of all memory instructions (e.g. load, store, call)
/// in the block - until the end of the scheduling region.
ScheduleData *NextLoadStore;
/// The dependent memory instructions.
/// This list is derived on demand in calculateDependencies().
SmallVector<ScheduleData *, 4> MemoryDependencies;
/// This ScheduleData is in the current scheduling region if this matches
/// the current SchedulingRegionID of BlockScheduling.
int SchedulingRegionID;
/// Used for getting a "good" final ordering of instructions.
int SchedulingPriority;
/// The number of dependencies. Constitutes of the number of users of the
/// instruction plus the number of dependent memory instructions (if any).
/// This value is calculated on demand.
/// If InvalidDeps, the number of dependencies is not calculated yet.
///
int Dependencies;
/// The number of dependencies minus the number of dependencies of scheduled
/// instructions. As soon as this is zero, the instruction/bundle gets ready
/// for scheduling.
/// Note that this is negative as long as Dependencies is not calculated.
int UnscheduledDeps;
/// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
/// single instructions.
int UnscheduledDepsInBundle;
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
bool IsScheduled;
};
#ifndef NDEBUG
friend inline raw_ostream &operator<<(raw_ostream &os,
const BoUpSLP::ScheduleData &SD) {
SD.dump(os);
return os;
}
#endif
/// Contains all scheduling data for a basic block.
///
struct BlockScheduling {
BlockScheduling(BasicBlock *BB)
: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
ScheduleStart(nullptr), ScheduleEnd(nullptr),
FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
ScheduleRegionSize(0),
ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
// Make sure that the initial SchedulingRegionID is greater than the
// initial SchedulingRegionID in ScheduleData (which is 0).
SchedulingRegionID(1) {}
void clear() {
ReadyInsts.clear();
ScheduleStart = nullptr;
ScheduleEnd = nullptr;
FirstLoadStoreInRegion = nullptr;
LastLoadStoreInRegion = nullptr;
// Reduce the maximum schedule region size by the size of the
// previous scheduling run.
ScheduleRegionSizeLimit -= ScheduleRegionSize;
if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
ScheduleRegionSizeLimit = MinScheduleRegionSize;
ScheduleRegionSize = 0;
// Make a new scheduling region, i.e. all existing ScheduleData is not
// in the new region yet.
++SchedulingRegionID;
}
ScheduleData *getScheduleData(Value *V) {
ScheduleData *SD = ScheduleDataMap[V];
if (SD && SD->SchedulingRegionID == SchedulingRegionID)
return SD;
return nullptr;
}
bool isInSchedulingRegion(ScheduleData *SD) {
return SD->SchedulingRegionID == SchedulingRegionID;
}
/// Marks an instruction as scheduled and puts all dependent ready
/// instructions into the ready-list.
template <typename ReadyListType>
void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
SD->IsScheduled = true;
DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
ScheduleData *BundleMember = SD;
while (BundleMember) {
// Handle the def-use chain dependencies.
for (Use &U : BundleMember->Inst->operands()) {
ScheduleData *OpDef = getScheduleData(U.get());
if (OpDef && OpDef->hasValidDependencies() &&
OpDef->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
ScheduleData *DepBundle = OpDef->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
}
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
BundleMember = BundleMember->NextInBundle;
}
}
/// Put all instructions into the ReadyList which are ready for scheduling.
template <typename ReadyListType>
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
ScheduleData *SD = getScheduleData(I);
if (SD->isSchedulingEntity() && SD->isReady()) {
ReadyList.insert(SD);
DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
}
}
}
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL);
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
bool extendSchedulingRegion(Value *V);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
void initScheduleData(Instruction *FromI, Instruction *ToI,
ScheduleData *PrevLoadStore,
ScheduleData *NextLoadStore);
/// Updates the dependency information of a bundle and of all instructions/
/// bundles which depend on the original bundle.
void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
BoUpSLP *SLP);
/// Sets all instruction in the scheduling region to un-scheduled.
void resetSchedule();
BasicBlock *BB;
/// Simple memory allocation for ScheduleData.
std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
/// The size of a ScheduleData array in ScheduleDataChunks.
int ChunkSize;
/// The allocator position in the current chunk, which is the last entry
/// of ScheduleDataChunks.
int ChunkPos;
/// Attaches ScheduleData to Instruction.
/// Note that the mapping survives during all vectorization iterations, i.e.
/// ScheduleData structures are recycled.
DenseMap<Value *, ScheduleData *> ScheduleDataMap;
struct ReadyList : SmallVector<ScheduleData *, 8> {
void insert(ScheduleData *SD) { push_back(SD); }
};
/// The ready-list for scheduling (only used for the dry-run).
ReadyList ReadyInsts;
/// The first instruction of the scheduling region.
Instruction *ScheduleStart;
/// The first instruction _after_ the scheduling region.
Instruction *ScheduleEnd;
/// The first memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *FirstLoadStoreInRegion;
/// The last memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *LastLoadStoreInRegion;
/// The current size of the scheduling region.
int ScheduleRegionSize;
/// The maximum size allowed for the scheduling region.
int ScheduleRegionSizeLimit;
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
int SchedulingRegionID;
};
/// Attaches the BlockScheduling structures to basic blocks.
MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
void scheduleBlock(BlockScheduling *BS);
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
// Number of load bundles that contain consecutive loads.
int NumLoadsWantToKeepOrder;
// Number of load bundles that contain consecutive loads in reversed order.
int NumLoadsWantToChangeOrder;
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
/// A map of scalar integer values to the smallest bit width with which they
/// can legally be represented. The values map to (width, signed) pairs,
/// where "width" indicates the minimum bit width and "signed" is True if the
/// value must be signed-extended, rather than zero-extended, back to its
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
} // end namespace llvm
} // end namespace slpvectorizer
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
UserIgnoreList = UserIgnoreLst;
if (!allSameType(Roots))
return;
buildTree_rec(Roots, 0);
// Collect the values that we need to extract from the tree.
for (TreeEntry &EIdx : VectorizableTree) {
TreeEntry *Entry = &EIdx;
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
// No need to handle users of gathered values.
if (Entry->NeedToGather)
continue;
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
continue;
// Skip in-tree scalars that become vectors
if (ScalarToTreeEntry.count(U)) {
int Idx = ScalarToTreeEntry[U];
TreeEntry *UseEntry = &VectorizableTree[Idx];
Value *UseScalar = UseEntry->Scalars[0];
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in Lane 0 will
// be used.
if (UseScalar != U ||
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
continue;
}
}
// Ignore users in the user ignore list.
if (is_contained(UserIgnoreList, UserInst))
continue;
DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
Lane << " from " << *Scalar << ".\n");
ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
}
}
}
}
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
bool isAltShuffle = false;
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
newTreeEntry(VL, false);
return;
}
// Don't handle vectors.
if (VL[0]->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, false);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
if (SI->getValueOperand()->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
newTreeEntry(VL, false);
return;
}
unsigned Opcode = getSameOpcode(VL);
// Check that this shuffle vector refers to the alternate
// sequence of opcodes.
if (Opcode == Instruction::ShuffleVector) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
unsigned Op = I0->getOpcode();
if (Op != Instruction::ShuffleVector)
isAltShuffle = true;
}
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false);
return;
}
// We now know that this is a vector of instructions of the same type from
// the same block.
// Don't vectorize ephemeral values.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (EphValues.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is ephemeral.\n");
newTreeEntry(VL, false);
return;
}
}
// Check if this is a duplicate of another entry.
if (ScalarToTreeEntry.count(VL[0])) {
int Idx = ScalarToTreeEntry[VL[0]];
TreeEntry *E = &VectorizableTree[Idx];
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
newTreeEntry(VL, false);
return;
}
}
DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
return;
}
// Check that none of the instructions in the bundle are already in the tree.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (ScalarToTreeEntry.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is already in tree.\n");
newTreeEntry(VL, false);
return;
}
}
// If any of the scalars is marked as a value that needs to stay scalar then
// we need to gather the scalars.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false);
return;
}
}
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
Instruction *VL0 = cast<Instruction>(VL[0]);
BasicBlock *BB = cast<Instruction>(VL0)->getParent();
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
newTreeEntry(VL, false);
return;
}
// Check that every instructions appears once in this bundle.
for (unsigned i = 0, e = VL.size(); i < e; ++i)
for (unsigned j = i+1; j < e; ++j)
if (VL[i] == VL[j]) {
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, false);
return;
}
auto &BSRef = BlocksSchedules[BB];
if (!BSRef) {
BSRef = llvm::make_unique<BlockScheduling>(BB);
}
BlockScheduling &BS = *BSRef.get();
if (!BS.tryScheduleBundle(VL, this)) {
DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL[0]) ||
!BS.getScheduleData(VL[0])->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, false);
return;
}
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
switch (Opcode) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
// Check for terminator values (e.g. invoke).
for (unsigned j = 0; j < VL.size(); ++j)
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
TerminatorInst *Term = dyn_cast<TerminatorInst>(
cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
buildTree_rec(Operands, Depth + 1);
}
return;
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
bool Reuse = canReuseExtract(VL, Opcode);
if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
} else {
BS.cancelScheduling(VL);
}
newTreeEntry(VL, Reuse);
return;
}
case Instruction::Load: {
// Check that a vectorized load would load the same memory as a scalar
// load.
// For example we don't want vectorize loads that are smaller than 8 bit.
// Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
// loading/storing it as an i8 struct. If we vectorize loads/stores from
// such a struct we read/write packed bits disagreeing with the
// unvectorized version.
Type *ScalarTy = VL[0]->getType();
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
}
// Check if the loads are consecutive, reversed, or neither.
// TODO: What we really want is to sort the loads, but for now, check
// the two likely directions.
bool Consecutive = true;
bool ReverseConsecutive = true;
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
Consecutive = false;
break;
} else {
ReverseConsecutive = false;
}
}
if (Consecutive) {
++NumLoadsWantToKeepOrder;
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
}
// If none of the load pairs were consecutive when checked in order,
// check the reverse order.
if (ReverseConsecutive)
for (unsigned i = VL.size() - 1; i > 0; --i)
if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
ReverseConsecutive = false;
break;
}
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
if (ReverseConsecutive) {
++NumLoadsWantToChangeOrder;
DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
} else {
DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
}
return;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
for (unsigned i = 0; i < VL.size(); ++i) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return;
}
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
return;
}
case Instruction::ICmp:
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
CmpInst *Cmp = cast<CmpInst>(VL[i]);
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
return;
}
case Instruction::Select:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right);
buildTree_rec(Left, Depth + 1);
buildTree_rec(Right, Depth + 1);
return;
}
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
return;
}
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
}
// We can't combine several GEPs into one vector if they operate on
// different types.
Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
for (unsigned j = 0; j < VL.size(); ++j) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
}
// We don't combine GEPs with non-constant indexes.
for (unsigned j = 0; j < VL.size(); ++j) {
auto Op = cast<Instruction>(VL[j])->getOperand(1);
if (!isa<ConstantInt>(Op)) {
DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth + 1);
}
return;
}
case Instruction::Store: {
// Check if the stores are consecutive or of we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(0));
buildTree_rec(Operands, Depth + 1);
return;
}
case Instruction::Call: {
// Check if the calls are all to the same vectorizable intrinsic.
CallInst *CI = cast<CallInst>(VL[0]);
// Check if this is an Intrinsic call or something that can be
// represented by an intrinsic call
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
Function *Int = CI->getCalledFunction();
Value *A1I = nullptr;
if (hasVectorInstrinsicScalarOpd(ID, 1))
A1I = CI->getArgOperand(1);
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
if (!CI2 || CI2->getCalledFunction() != Int ||
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
return;
}
// ctlz,cttz and powi are special intrinsics whose second argument
// should be same in order for them to be vectorized.
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
<< "\n");
return;
}
}
// Verify that the bundle operands are identical between the two calls.
if (CI->hasOperandBundles() &&
!std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
<< *VL[i] << '\n');
return;
}
}
newTreeEntry(VL, true);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL) {
CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
buildTree_rec(Operands, Depth + 1);
}
return;
}
case Instruction::ShuffleVector: {
// If this is not an alternate sequence of opcode like add-sub
// then do not vectorize this instruction.
if (!isAltShuffle) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
reorderAltShuffleOperands(VL, Left, Right);
buildTree_rec(Left, Depth + 1);
buildTree_rec(Right, Depth + 1);
return;
}
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth + 1);
}
return;
}
default:
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
}
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N;
Type *EltTy;
auto *ST = dyn_cast<StructType>(T);
if (ST) {
N = ST->getNumElements();
EltTy = *ST->element_begin();
} else {
N = cast<ArrayType>(T)->getNumElements();
EltTy = cast<ArrayType>(T)->getElementType();
}
if (!isValidElementType(EltTy))
return 0;
uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
if (ST) {
// Check that struct is homogeneous.
for (const auto *Ty : ST->elements())
if (Ty != EltTy)
return 0;
}
return N;
}
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
assert(Opcode == Instruction::ExtractElement ||
Opcode == Instruction::ExtractValue);
assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
Value *VL0 = VL[0];
Instruction *E0 = cast<Instruction>(VL0);
Value *Vec = E0->getOperand(0);
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
if (Opcode == Instruction::ExtractValue) {
const DataLayout &DL = E0->getModule()->getDataLayout();
NElts = canMapToVector(Vec->getType(), DL);
if (!NElts)
return false;
// Check if load can be rewritten as load of vector.
LoadInst *LI = dyn_cast<LoadInst>(Vec);
if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
return false;
} else {
NElts = Vec->getType()->getVectorNumElements();
}
if (NElts != VL.size())
return false;
// Check that all of the indices extract from the correct offset.
if (!matchExtractIndex(E0, 0, Opcode))
return false;
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
if (!matchExtractIndex(E, i, Opcode))
return false;
if (E->getOperand(0) != Vec)
return false;
}
return true;
}
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
if (MinBWs.count(VL[0]))
VecTy = VectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
if (E->NeedToGather) {
if (allConstant(VL))
return 0;
if (isSplat(VL)) {
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
return getGatherCost(E->Scalars);
}
unsigned Opcode = getSameOpcode(VL);
assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(VL[0]);
switch (Opcode) {
case Instruction::PHI: {
return 0;
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
if (canReuseExtract(VL, Opcode)) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
if (E->hasOneUse())
// Take credit for instruction that will become dead.
DeadCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
}
return -DeadCost;
}
return getGatherCost(VecTy);
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
VL0->getType(), SrcTy);
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
return VecCost - ScalarCost;
}
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() *
TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
return VecCost - ScalarCost;
}
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
// Certain instructions can be cheaper to vectorize if they have a
// constant second vector operand.
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
TargetTransformInfo::OP_None;
// If all operands are exactly the same ConstantInt then set the
// operand kind to OK_UniformConstantValue.
// If instead not all operands are constants, then set the operand kind
// to OK_AnyValue. If all operands are constants but not the same,
// then set the operand kind to OK_NonUniformConstantValue.
ConstantInt *CInt = nullptr;
for (unsigned i = 0; i < VL.size(); ++i) {
const Instruction *I = cast<Instruction>(VL[i]);
if (!isa<ConstantInt>(I->getOperand(1))) {
Op2VK = TargetTransformInfo::OK_AnyValue;
break;
}
if (i == 0) {
CInt = cast<ConstantInt>(I->getOperand(1));
continue;
}
if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
CInt != cast<ConstantInt>(I->getOperand(1)))
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
}
// FIXME: Currently cost of model modification for division by power of
// 2 is handled for X86 and AArch64. Add support for other targets.
if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
int ScalarCost = VecTy->getNumElements() *
TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
Op2VK, Op1VP, Op2VP);
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
Op1VP, Op2VP);
return VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
int ScalarCost =
VecTy->getNumElements() *
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
int VecCost =
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
return VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
int ScalarLdCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
VecTy, alignment, 0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
int ScalarStCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
VecTy, alignment, 0);
return VecStCost - ScalarStCost;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
SmallVector<Type*, 4> ScalarTys, VecTys;
for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
ScalarTys.push_back(CI->getArgOperand(op)->getType());
VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
VecTy->getNumElements()));
}
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
int ScalarCallCost = VecTy->getNumElements() *
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
<< " for " << *CI << "\n");
return VecCallCost - ScalarCallCost;
}
case Instruction::ShuffleVector: {
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
int ScalarCost = 0;
int VecCost = 0;
for (Value *i : VL) {
Instruction *I = cast<Instruction>(i);
if (!I)
break;
ScalarCost +=
TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
Instruction *I0 = cast<Instruction>(VL[0]);
VecCost =
TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
Instruction *I1 = cast<Instruction>(VL[1]);
VecCost +=
TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
VecCost +=
TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
return VecCost - ScalarCost;
}
default:
llvm_unreachable("Unknown instruction");
}
}
bool BoUpSLP::isFullyVectorizableTinyTree() {
DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
VectorizableTree.size() << " is fully vectorizable .\n");
// We only handle trees of heights 1 and 2.
if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
return true;
if (VectorizableTree.size() != 2)
return false;
// Handle splat and all-constants stores.
if (!VectorizableTree[0].NeedToGather &&
(allConstant(VectorizableTree[1].Scalars) ||
isSplat(VectorizableTree[1].Scalars)))
return true;
// Gathering cost would be too much for tiny trees.
if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
return false;
return true;
}
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
return false;
// If we have a tiny tree (a tree whose size is less than MinTreeSize), we
// can vectorize it if we can prove it fully vectorizable.
if (isFullyVectorizableTinyTree())
return false;
assert(VectorizableTree.empty()
? ExternalUses.empty()
: true && "We shouldn't have any external users");
// Otherwise, we can't vectorize the tree. It is both tiny and not fully
// vectorizable.
return true;
}
int BoUpSLP::getSpillCost() {
// Walk from the bottom of the tree to the top, tracking which values are
// live. When we see a call instruction that is not part of our tree,
// query TTI to see if there is a cost to keeping values live over it
// (for example, if spills and fills are required).
unsigned BundleWidth = VectorizableTree.front().Scalars.size();
int Cost = 0;
SmallPtrSet<Instruction*, 4> LiveValues;
Instruction *PrevInst = nullptr;
for (const auto &N : VectorizableTree) {
Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
if (!Inst)
continue;
if (!PrevInst) {
PrevInst = Inst;
continue;
}
// Update LiveValues.
LiveValues.erase(PrevInst);
for (auto &J : PrevInst->operands()) {
if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
LiveValues.insert(cast<Instruction>(&*J));
}
DEBUG(
dbgs() << "SLP: #LV: " << LiveValues.size();
for (auto *X : LiveValues)
dbgs() << " " << X->getName();
dbgs() << ", Looking at ";
Inst->dump();
);
// Now find the sequence of instructions between PrevInst and Inst.
BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
PrevInstIt =
PrevInst->getIterator().getReverse();
while (InstIt != PrevInstIt) {
if (PrevInstIt == PrevInst->getParent()->rend()) {
PrevInstIt = Inst->getParent()->rbegin();
continue;
}
if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
SmallVector<Type*, 4> V;
for (auto *II : LiveValues)
V.push_back(VectorType::get(II->getType(), BundleWidth));
Cost += TTI->getCostOfKeepingLiveOverCall(V);
}
++PrevInstIt;
}
PrevInst = Inst;
}
return Cost;
}
int BoUpSLP::getTreeCost() {
int Cost = 0;
DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
VectorizableTree.size() << ".\n");
unsigned BundleWidth = VectorizableTree[0].Scalars.size();
for (TreeEntry &TE : VectorizableTree) {
int C = getEntryCost(&TE);
DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
<< *TE.Scalars[0] << ".\n");
Cost += C;
}
SmallSet<Value *, 16> ExtractCostCalculated;
int ExtractCost = 0;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!ExtractCostCalculated.insert(EU.Scalar).second)
continue;
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
// removed as well).
if (EphValues.count(EU.User))
continue;
// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
auto *ScalarRoot = VectorizableTree[0].Scalars[0];
if (MinBWs.count(ScalarRoot)) {
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto Extend =
MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
VecTy = VectorType::get(MinTy, BundleWidth);
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
ExtractCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
}
}
int SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n");
return Cost;
}
int BoUpSLP::getGatherCost(Type *Ty) {
int Cost = 0;
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
return Cost;
}
int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector.
return getGatherCost(VecTy);
}
// Reorder commutative operations in alternate shuffle if the resulting vectors
// are consecutive loads. This would allow us to vectorize the tree.
// If we have something like-
// load a[0] - load b[0]
// load b[1] + load a[1]
// load a[2] - load b[2]
// load a[3] + load b[3]
// Reordering the second load b[1] load a[1] would allow us to vectorize this
// code.
void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
// Push left and right operands of binary operation into Left and Right
for (Value *i : VL) {
Left.push_back(cast<Instruction>(i)->getOperand(0));
Right.push_back(cast<Instruction>(i)->getOperand(1));
}
// Reorder if we have a commutative operation and consecutive access
// are on either side of the alternate instructions.
for (unsigned j = 0; j < VL.size() - 1; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j], Right[j]);
continue;
} else if (VL2->isCommutative() &&
isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
// else unchanged
}
}
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j], Right[j]);
continue;
} else if (VL2->isCommutative() &&
isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
// else unchanged
}
}
}
}
// Return true if I should be commuted before adding it's left and right
// operands to the arrays Left and Right.
//
// The vectorizer is trying to either have all elements one side being
// instruction with the same opcode to enable further vectorization, or having
// a splat to lower the vectorizing cost.
static bool shouldReorderOperands(int i, Instruction &I,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
bool AllSameOpcodeLeft,
bool AllSameOpcodeRight, bool SplatLeft,
bool SplatRight) {
Value *VLeft = I.getOperand(0);
Value *VRight = I.getOperand(1);
// If we have "SplatRight", try to see if commuting is needed to preserve it.
if (SplatRight) {
if (VRight == Right[i - 1])
// Preserve SplatRight
return false;
if (VLeft == Right[i - 1]) {
// Commuting would preserve SplatRight, but we don't want to break
// SplatLeft either, i.e. preserve the original order if possible.
// (FIXME: why do we care?)
if (SplatLeft && VLeft == Left[i - 1])
return false;
return true;
}
}
// Symmetrically handle Right side.
if (SplatLeft) {
if (VLeft == Left[i - 1])
// Preserve SplatLeft
return false;
if (VRight == Left[i - 1])
return true;
}
Instruction *ILeft = dyn_cast<Instruction>(VLeft);
Instruction *IRight = dyn_cast<Instruction>(VRight);
// If we have "AllSameOpcodeRight", try to see if the left operands preserves
// it and not the right, in this case we want to commute.
if (AllSameOpcodeRight) {
unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
if (IRight && RightPrevOpcode == IRight->getOpcode())
// Do not commute, a match on the right preserves AllSameOpcodeRight
return false;
if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
// We have a match and may want to commute, but first check if there is
// not also a match on the existing operands on the Left to preserve
// AllSameOpcodeLeft, i.e. preserve the original order if possible.
// (FIXME: why do we care?)
if (AllSameOpcodeLeft && ILeft &&
cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
return false;
return true;
}
}
// Symmetrically handle Left side.
if (AllSameOpcodeLeft) {
unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
return false;
if (IRight && LeftPrevOpcode == IRight->getOpcode())
return true;
}
return false;
}
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
if (VL.size()) {
// Peel the first iteration out of the loop since there's nothing
// interesting to do anyway and it simplifies the checks in the loop.
auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
auto VRight = cast<Instruction>(VL[0])->getOperand(1);
if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
// Favor having instruction to the right. FIXME: why?
std::swap(VLeft, VRight);
Left.push_back(VLeft);
Right.push_back(VRight);
}
// Keep track if we have instructions with all the same opcode on one side.
bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
// Keep track if we have one side with all the same value (broadcast).
bool SplatLeft = true;
bool SplatRight = true;
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
Instruction *I = cast<Instruction>(VL[i]);
assert(I->isCommutative() && "Can only process commutative instruction");
// Commute to favor either a splat or maximizing having the same opcodes on
// one side.
if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
AllSameOpcodeRight, SplatLeft, SplatRight)) {
Left.push_back(I->getOperand(1));
Right.push_back(I->getOperand(0));
} else {
Left.push_back(I->getOperand(0));
Right.push_back(I->getOperand(1));
}
// Update Splat* and AllSameOpcode* after the insertion.
SplatRight = SplatRight && (Right[i - 1] == Right[i]);
SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
(cast<Instruction>(Left[i - 1])->getOpcode() ==
cast<Instruction>(Left[i])->getOpcode());
AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
(cast<Instruction>(Right[i - 1])->getOpcode() ==
cast<Instruction>(Right[i])->getOpcode());
}
// If one operand end up being broadcast, return this operand order.
if (SplatRight || SplatLeft)
return;
// Finally check if we can get longer vectorizable chain by reordering
// without breaking the good operand order detected above.
// E.g. If we have something like-
// load a[0] load b[0]
// load b[1] load a[1]
// load a[2] load b[2]
// load a[3] load b[3]
// Reordering the second load b[1] load a[1] would allow us to vectorize
// this code and we still retain AllSameOpcode property.
// FIXME: This load reordering might break AllSameOpcode in some rare cases
// such as-
// add a[0],c[0] load b[0]
// add a[1],c[2] load b[1]
// b[2] load b[2]
// add a[3],c[3] load b[3]
for (unsigned j = 0; j < VL.size() - 1; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
if (isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
}
}
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
if (isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
}
}
// else unchanged
}
}
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
auto *Front = cast<Instruction>(VL.front());
auto *BB = Front->getParent();
assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool {
return cast<Instruction>(V)->getParent() == BB;
}));
// The last instruction in the bundle in program order.
Instruction *LastInst = nullptr;
// Find the last instruction. The common case should be that BB has been
// scheduled, and the last instruction is VL.back(). So we start with
// VL.back() and iterate over schedule data until we reach the end of the
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back());
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
LastInst = Bundle->Inst;
}
// LastInst can still be null at this point if there's either not an entry
// for BB in BlocksSchedules or there's no ScheduleData available for
// VL.back(). This can be the case if buildTree_rec aborts for various
// reasons (e.g., the maximum recursion depth is reached, the maximum region
// size is reached, etc.). ScheduleData is initialized in the scheduling
// "dry-run".
//
// If this happens, we can still find the last instruction by brute force. We
// iterate forwards from Front (inclusive) until we either see all
// instructions in the bundle or reach the end of the block. If Front is the
// last instruction in program order, LastInst will be set to Front, and we
// will visit all the remaining instructions in the block.
//
// One of the reasons we exit early from buildTree_rec is to place an upper
// bound on compile-time. Thus, taking an additional compile-time hit here is
// not ideal. However, this should be exceedingly rare since it requires that
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
if (!LastInst) {
SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
if (Bundle.erase(&I))
LastInst = &I;
if (Bundle.empty())
break;
}
}
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
Builder.SetInsertPoint(BB, ++LastInst->getIterator());
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
Value *Vec = UndefValue::get(Ty);
// Generate the 'InsertElement' instruction.
for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(Insrt);
CSEBlocks.insert(Insrt->getParent());
// Add to our 'need-to-extract' list.
if (ScalarToTreeEntry.count(VL[i])) {
int Idx = ScalarToTreeEntry[VL[i]];
TreeEntry *E = &VectorizableTree[Idx];
// Find which lane we need to extract.
int FoundLane = -1;
for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
// Is this the lane of the scalar that we are looking for ?
if (E->Scalars[Lane] == VL[i]) {
FoundLane = Lane;
break;
}
}
assert(FoundLane >= 0 && "Could not find the correct lane");
ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
}
}
}
return Vec;
}
Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
SmallDenseMap<Value*, int>::const_iterator Entry
= ScalarToTreeEntry.find(VL[0]);
if (Entry != ScalarToTreeEntry.end()) {
int Idx = Entry->second;
const TreeEntry *En = &VectorizableTree[Idx];
if (En->isSame(VL) && En->VectorizedValue)
return En->VectorizedValue;
}
return nullptr;
}
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
if (ScalarToTreeEntry.count(VL[0])) {
int Idx = ScalarToTreeEntry[VL[0]];
TreeEntry *E = &VectorizableTree[Idx];
if (E->isSame(VL))
return vectorizeTree(E);
}
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
return Gather(VL, VecTy);
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->VectorizedValue) {
DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
if (E->NeedToGather) {
setInsertPointAfterBundle(E->Scalars);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
}
unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
E->VectorizedValue = NewPhi;
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
SmallSet<BasicBlock*, 4> VisitedBBs;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
BasicBlock *IBB = PH->getIncomingBlock(i);
if (!VisitedBBs.insert(IBB).second) {
NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
continue;
}
// Prepare the operand vector.
for (Value *V : E->Scalars)
Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
Value *Vec = vectorizeTree(Operands);
NewPhi->addIncoming(Vec, IBB);
}
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
"Invalid number of incoming values");
return NewPhi;
}
case Instruction::ExtractElement: {
if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
Value *V = VL0->getOperand(0);
E->VectorizedValue = V;
return V;
}
setInsertPointAfterBundle(E->Scalars);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
Builder.SetInsertPoint(LI);
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
E->VectorizedValue = V;
return propagateMetadata(V, E->Scalars);
}
setInsertPointAfterBundle(E->Scalars);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
ValueList INVL;
for (Value *V : E->Scalars)
INVL.push_back(cast<Instruction>(V)->getOperand(0));
setInsertPointAfterBundle(E->Scalars);
Value *InVec = vectorizeTree(INVL);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
CastInst *CI = dyn_cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::FCmp:
case Instruction::ICmp: {
ValueList LHSV, RHSV;
for (Value *V : E->Scalars) {
LHSV.push_back(cast<Instruction>(V)->getOperand(0));
RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
Value *L = vectorizeTree(LHSV);
Value *R = vectorizeTree(RHSV);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V;
if (Opcode == Instruction::FCmp)
V = Builder.CreateFCmp(P0, L, R);
else
V = Builder.CreateICmp(P0, L, R);
E->VectorizedValue = V;
propagateIRFlags(E->VectorizedValue, E->Scalars);
++NumVectorInstructions;
return V;
}
case Instruction::Select: {
ValueList TrueVec, FalseVec, CondVec;
for (Value *V : E->Scalars) {
CondVec.push_back(cast<Instruction>(V)->getOperand(0));
TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
setInsertPointAfterBundle(E->Scalars);
Value *Cond = vectorizeTree(CondVec);
Value *True = vectorizeTree(TrueVec);
Value *False = vectorizeTree(FalseVec);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
Value *V = Builder.CreateSelect(Cond, True, False);
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
else
for (Value *V : E->Scalars) {
LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
E->VectorizedValue = V;
propagateIRFlags(E->VectorizedValue, E->Scalars);
++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
return V;
}
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
setInsertPointAfterBundle(E->Scalars);
LoadInst *LI = cast<LoadInst>(VL0);
Type *ScalarLoadTy = LI->getType();
unsigned AS = LI->getPointerAddressSpace();
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
VecTy->getPointerTo(AS));
// The pointer operand uses an in-tree scalar so we add the new BitCast to
// ExternalUses list to make sure that an extract will be generated in the
// future.
if (ScalarToTreeEntry.count(LI->getPointerOperand()))
ExternalUses.push_back(
ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
if (!Alignment) {
Alignment = DL->getABITypeAlignment(ScalarLoadTy);
}
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
++NumVectorInstructions;
return propagateMetadata(LI, E->Scalars);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
unsigned Alignment = SI->getAlignment();
unsigned AS = SI->getPointerAddressSpace();
ValueList ValueOp;
for (Value *V : E->Scalars)
ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
setInsertPointAfterBundle(E->Scalars);
Value *VecValue = vectorizeTree(ValueOp);
Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
VecTy->getPointerTo(AS));
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
// The pointer operand uses an in-tree scalar so we add the new BitCast to
// ExternalUses list to make sure that an extract will be generated in the
// future.
if (ScalarToTreeEntry.count(SI->getPointerOperand()))
ExternalUses.push_back(
ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
if (!Alignment) {
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
}
S->setAlignment(Alignment);
E->VectorizedValue = S;
++NumVectorInstructions;
return propagateMetadata(S, E->Scalars);
}
case Instruction::GetElementPtr: {
setInsertPointAfterBundle(E->Scalars);
ValueList Op0VL;
for (Value *V : E->Scalars)
Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
Value *Op0 = vectorizeTree(Op0VL);
std::vector<Value *> OpVecs;
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
++j) {
ValueList OpVL;
for (Value *V : E->Scalars)
OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
Value *OpVec = vectorizeTree(OpVL);
OpVecs.push_back(OpVec);
}
Value *V = Builder.CreateGEP(
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
E->VectorizedValue = V;
++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
return V;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
setInsertPointAfterBundle(E->Scalars);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
Value *ScalarArg = nullptr;
if (CI && (FI = CI->getCalledFunction())) {
IID = FI->getIntrinsicID();
}
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
// ctlz,cttz and powi are special intrinsics whose second argument is
// a scalar. This argument should not be vectorized.
if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
CallInst *CEI = cast<CallInst>(E->Scalars[0]);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
for (Value *V : E->Scalars) {
CallInst *CEI = cast<CallInst>(V);
OpVL.push_back(CEI->getArgOperand(j));
}
Value *OpVec = vectorizeTree(OpVL);
DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
}
Module *M = F->getParent();
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
SmallVector<OperandBundleDef, 1> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
// generated in the future.
if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
E->VectorizedValue = V;
propagateIRFlags(E->VectorizedValue, E->Scalars);
++NumVectorInstructions;
return V;
}
case Instruction::ShuffleVector: {
ValueList LHSVL, RHSVL;
assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
setInsertPointAfterBundle(E->Scalars);
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
// Create a vector of LHS op1 RHS
BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
// Create a vector of LHS op2 RHS
Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
// Create shuffle to take alternate operations from the vector.
// Also, gather up odd and even scalar ops to propagate IR flags to
// each vector operation.
ValueList OddScalars, EvenScalars;
unsigned e = E->Scalars.size();
SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
if (i & 1) {
Mask[i] = Builder.getInt32(e + i);
OddScalars.push_back(E->Scalars[i]);
} else {
Mask[i] = Builder.getInt32(i);
EvenScalars.push_back(E->Scalars[i]);
}
}
Value *ShuffleMask = ConstantVector::get(Mask);
propagateIRFlags(V0, EvenScalars);
propagateIRFlags(V1, OddScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
E->VectorizedValue = V;
++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
return V;
}
default:
llvm_unreachable("unknown inst");
}
return nullptr;
}
Value *BoUpSLP::vectorizeTree() {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());
}
Builder.SetInsertPoint(&F->getEntryBlock().front());
auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
// If the vectorized tree can be rewritten in a smaller type, we truncate the
// vectorized root. InstCombine will then rewrite the entire expression. We
// sign extend the extracted values below.
auto *ScalarRoot = VectorizableTree[0].Scalars[0];
if (MinBWs.count(ScalarRoot)) {
if (auto *I = dyn_cast<Instruction>(VectorRoot))
Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
auto BundleWidth = VectorizableTree[0].Scalars.size();
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto *VecTy = VectorType::get(MinTy, BundleWidth);
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
VectorizableTree[0].VectorizedValue = Trunc;
}
DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
// If necessary, sign-extend or zero-extend ScalarRoot to the larger type
// specified by ScalarType.
auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
if (!MinBWs.count(ScalarRoot))
return Ex;
if (MinBWs[ScalarRoot].second)
return Builder.CreateSExt(Ex, ScalarType);
return Builder.CreateZExt(Ex, ScalarType);
};
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
llvm::User *User = ExternalUse.User;
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
if (!is_contained(Scalar->users(), User))
continue;
assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
int Idx = ScalarToTreeEntry[Scalar];
TreeEntry *E = &VectorizableTree[Idx];
assert(!E->NeedToGather && "Extracting from a gather list");
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
if (PHINode *PH = dyn_cast<PHINode>(User)) {
for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
if (PH->getIncomingValue(i) == Scalar) {
TerminatorInst *IncomingTerminator =
PH->getIncomingBlock(i)->getTerminator();
if (isa<CatchSwitchInst>(IncomingTerminator)) {
Builder.SetInsertPoint(VecI->getParent(),
std::next(VecI->getIterator()));
} else {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
}
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);
}
}
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
}
DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}
// For each vectorized value:
for (TreeEntry &EIdx : VectorizableTree) {
TreeEntry *Entry = &EIdx;
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
// No need to handle users of gathered values.
if (Entry->NeedToGather)
continue;
assert(Entry->VectorizedValue && "Can't find vectorizable value");
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
#ifndef NDEBUG
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
assert((ScalarToTreeEntry.count(U) ||
// It is legal to replace users in the ignorelist by undef.
is_contained(UserIgnoreList, U)) &&
"Replacing out-of-tree value with undef");
}
#endif
Value *Undef = UndefValue::get(Ty);
Scalar->replaceAllUsesWith(Undef);
}
DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
eraseInstruction(cast<Instruction>(Scalar));
}
}
Builder.ClearInsertionPoint();
return VectorizableTree[0].VectorizedValue;
}
void BoUpSLP::optimizeGatherSequence() {
DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
for (Instruction *it : GatherSeq) {
InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
if (!Insert)
continue;
// Check if this block is inside a loop.
Loop *L = LI->getLoopFor(Insert->getParent());
if (!L)
continue;
// Check if it has a preheader.
BasicBlock *PreHeader = L->getLoopPreheader();
if (!PreHeader)
continue;
// If the vector or the element that we insert into it are
// instructions that are defined in this basic block then we can't
// hoist this instruction.
Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
if (CurrVec && L->contains(CurrVec))
continue;
if (NewElem && L->contains(NewElem))
continue;
// We can hoist this instruction. Move it to the pre-header.
Insert->moveBefore(PreHeader->getTerminator());
}
// Make a list of all reachable blocks in our CSE queue.
SmallVector<const DomTreeNode *, 8> CSEWorkList;
CSEWorkList.reserve(CSEBlocks.size());
for (BasicBlock *BB : CSEBlocks)
if (DomTreeNode *N = DT->getNode(BB)) {
assert(DT->isReachableFromEntry(N));
CSEWorkList.push_back(N);
}
// Sort blocks by domination. This ensures we visit a block after all blocks
// dominating it are visited.
std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
[this](const DomTreeNode *A, const DomTreeNode *B) {
return DT->properlyDominates(A, B);
});
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
"Worklist not sorted properly!");
BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
Instruction *In = &*it++;
if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
for (Instruction *v : Visited) {
if (In->isIdenticalTo(v) &&
DT->dominates(v->getParent(), In->getParent())) {
In->replaceAllUsesWith(v);
eraseInstruction(In);
In = nullptr;
break;
}
}
if (In) {
assert(!is_contained(Visited, In));
Visited.push_back(In);
}
}
}
CSEBlocks.clear();
GatherSeq.clear();
}
// Groups the instructions to a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
BoUpSLP *SLP) {
if (isa<PHINode>(VL[0]))
return true;
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
ScheduleData *PrevInBundle = nullptr;
ScheduleData *Bundle = nullptr;
bool ReSchedule = false;
DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
if (!extendSchedulingRegion(V))
return false;
}
for (Value *V : VL) {
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member (maybe not in same basic block)");
if (BundleMember->IsScheduled) {
// A bundle member was scheduled as single instruction before and now
// needs to be scheduled as part of the bundle. We just get rid of the
// existing schedule.
DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
<< " was already scheduled\n");
ReSchedule = true;
}
assert(BundleMember->isSchedulingEntity() &&
"bundle member already part of other bundle");
if (PrevInBundle) {
PrevInBundle->NextInBundle = BundleMember;
} else {
Bundle = BundleMember;
}
BundleMember->UnscheduledDepsInBundle = 0;
Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
// Group the instructions to a bundle.
BundleMember->FirstInBundle = Bundle;
PrevInBundle = BundleMember;
}
if (ScheduleEnd != OldScheduleEnd) {
// The scheduling region got new instructions at the lower end (or it is a
// new region for the first bundle). This makes it necessary to
// recalculate all dependencies.
// It is seldom that this needs to be done a second time after adding the
// initial bundle to the region.
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
ScheduleData *SD = getScheduleData(I);
SD->clearDependencies();
}
ReSchedule = true;
}
if (ReSchedule) {
resetSchedule();
initialFillReadyList(ReadyInsts);
}
DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
<< BB->getName() << "\n");
calculateDependencies(Bundle, true, SLP);
// Now try to schedule the new bundle. As soon as the bundle is "ready" it
// means that there are no cyclic dependencies and we can schedule it.
// Note that's important that we don't "schedule" the bundle yet (see
// cancelScheduling).
while (!Bundle->isReady() && !ReadyInsts.empty()) {
ScheduleData *pickedSD = ReadyInsts.back();
ReadyInsts.pop_back();
if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
schedule(pickedSD, ReadyInsts);
}
}
if (!Bundle->isReady()) {
cancelScheduling(VL);
return false;
}
return true;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
if (isa<PHINode>(VL[0]))
return;
ScheduleData *Bundle = getScheduleData(VL[0]);
DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
"tried to unbundle something which is not a bundle");
// Un-bundle: make single instructions out of the bundle.
ScheduleData *BundleMember = Bundle;
while (BundleMember) {
assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
BundleMember->FirstInBundle = BundleMember;
ScheduleData *Next = BundleMember->NextInBundle;
BundleMember->NextInBundle = nullptr;
BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
if (BundleMember->UnscheduledDepsInBundle == 0) {
ReadyInsts.insert(BundleMember);
}
BundleMember = Next;
}
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
if (getScheduleData(V))
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
if (!ScheduleStart) {
// It's the first instruction in the new region.
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
}
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
BasicBlock::reverse_iterator UpIter =
++ScheduleStart->getIterator().getReverse();
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
for (;;) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
return false;
}
if (UpIter != UpperEnd) {
if (&*UpIter == I) {
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
return true;
}
UpIter++;
}
if (DownIter != LowerEnd) {
if (&*DownIter == I) {
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
nullptr);
ScheduleEnd = I->getNextNode();
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
return true;
}
DownIter++;
}
assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
"instruction not found in block");
}
return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
Instruction *ToI,
ScheduleData *PrevLoadStore,
ScheduleData *NextLoadStore) {
ScheduleData *CurrentLoadStore = PrevLoadStore;
for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
ScheduleData *SD = ScheduleDataMap[I];
if (!SD) {
// Allocate a new ScheduleData for the instruction.
if (ChunkPos >= ChunkSize) {
ScheduleDataChunks.push_back(
llvm::make_unique<ScheduleData[]>(ChunkSize));
ChunkPos = 0;
}
SD = &(ScheduleDataChunks.back()[ChunkPos++]);
ScheduleDataMap[I] = SD;
SD->Inst = I;
}
assert(!isInSchedulingRegion(SD) &&
"new ScheduleData already in scheduling region");
SD->init(SchedulingRegionID);
if (I->mayReadOrWriteMemory()) {
// Update the linked list of memory accessing instructions.
if (CurrentLoadStore) {
CurrentLoadStore->NextLoadStore = SD;
} else {
FirstLoadStoreInRegion = SD;
}
CurrentLoadStore = SD;
}
}
if (NextLoadStore) {
if (CurrentLoadStore)
CurrentLoadStore->NextLoadStore = NextLoadStore;
} else {
LastLoadStoreInRegion = CurrentLoadStore;
}
}
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
BoUpSLP *SLP) {
assert(SD->isSchedulingEntity());
SmallVector<ScheduleData *, 10> WorkList;
WorkList.push_back(SD);
while (!WorkList.empty()) {
ScheduleData *SD = WorkList.back();
WorkList.pop_back();
ScheduleData *BundleMember = SD;
while (BundleMember) {
assert(isInSchedulingRegion(BundleMember));
if (!BundleMember->hasValidDependencies()) {
DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
BundleMember->Dependencies = 0;
BundleMember->resetUnscheduledDeps();
// Handle def-use chain dependencies.
for (User *U : BundleMember->Inst->users()) {
if (isa<Instruction>(U)) {
ScheduleData *UseSD = getScheduleData(U);
if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
BundleMember->Dependencies++;
ScheduleData *DestBundle = UseSD->FirstInBundle;
if (!DestBundle->IsScheduled) {
BundleMember->incrementUnscheduledDeps(1);
}
if (!DestBundle->hasValidDependencies()) {
WorkList.push_back(DestBundle);
}
}
} else {
// I'm not sure if this can ever happen. But we need to be safe.
// This lets the instruction/bundle never be scheduled and
// eventually disable vectorization.
BundleMember->Dependencies++;
BundleMember->incrementUnscheduledDeps(1);
}
}
// Handle the memory dependencies.
ScheduleData *DepDest = BundleMember->NextLoadStore;
if (DepDest) {
Instruction *SrcInst = BundleMember->Inst;
MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
unsigned numAliased = 0;
unsigned DistToSrc = 1;
while (DepDest) {
assert(isInSchedulingRegion(DepDest));
// We have two limits to reduce the complexity:
// 1) AliasedCheckLimit: It's a small limit to reduce calls to
// SLP->isAliased (which is the expensive part in this loop).
// 2) MaxMemDepDistance: It's for very large blocks and it aborts
// the whole loop (even if the loop is fast, it's quadratic).
// It's important for the loop break condition (see below) to
// check this limit even between two read-only instructions.
if (DistToSrc >= MaxMemDepDistance ||
((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
(numAliased >= AliasedCheckLimit ||
SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
// We increment the counter only if the locations are aliased
// (instead of counting all alias checks). This gives a better
// balance between reduced runtime and accurate dependencies.
numAliased++;
DepDest->MemoryDependencies.push_back(BundleMember);
BundleMember->Dependencies++;
ScheduleData *DestBundle = DepDest->FirstInBundle;
if (!DestBundle->IsScheduled) {
BundleMember->incrementUnscheduledDeps(1);
}
if (!DestBundle->hasValidDependencies()) {
WorkList.push_back(DestBundle);
}
}
DepDest = DepDest->NextLoadStore;
// Example, explaining the loop break condition: Let's assume our
// starting instruction is i0 and MaxMemDepDistance = 3.
//
// +--------v--v--v
// i0,i1,i2,i3,i4,i5,i6,i7,i8
// +--------^--^--^
//
// MaxMemDepDistance let us stop alias-checking at i3 and we add
// dependencies from i0 to i3,i4,.. (even if they are not aliased).
// Previously we already added dependencies from i3 to i6,i7,i8
// (because of MaxMemDepDistance). As we added a dependency from
// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
// and we can abort this loop at i6.
if (DistToSrc >= 2 * MaxMemDepDistance)
break;
DistToSrc++;
}
}
}
BundleMember = BundleMember->NextInBundle;
}
if (InsertInReadyList && SD->isReady()) {
ReadyInsts.push_back(SD);
DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
}
}
}
void BoUpSLP::BlockScheduling::resetSchedule() {
assert(ScheduleStart &&
"tried to reset schedule on block which has not been scheduled");
for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
ScheduleData *SD = getScheduleData(I);
assert(isInSchedulingRegion(SD));
SD->IsScheduled = false;
SD->resetUnscheduledDeps();
}
ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
if (!BS->ScheduleStart)
return;
DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
BS->resetSchedule();
// For the real scheduling we use a more sophisticated ready-list: it is
// sorted by the original instruction location. This lets the final schedule
// be as close as possible to the original instruction order.
struct ScheduleDataCompare {
bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
return SD2->SchedulingPriority < SD1->SchedulingPriority;
}
};
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
// Ensure that all dependency data is updated and fill the ready-list with
// initial instructions.
int Idx = 0;
int NumToSchedule = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
ScheduleData *SD = BS->getScheduleData(I);
assert(
SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
"scheduler and vectorizer have different opinion on what is a bundle");
SD->FirstInBundle->SchedulingPriority = Idx++;
if (SD->isSchedulingEntity()) {
BS->calculateDependencies(SD, false, this);
NumToSchedule++;
}
}
BS->initialFillReadyList(ReadyInsts);
Instruction *LastScheduledInst = BS->ScheduleEnd;
// Do the "real" scheduling.
while (!ReadyInsts.empty()) {
ScheduleData *picked = *ReadyInsts.begin();
ReadyInsts.erase(ReadyInsts.begin());
// Move the scheduled instruction(s) to their dedicated places, if not
// there yet.
ScheduleData *BundleMember = picked;
while (BundleMember) {
Instruction *pickedInst = BundleMember->Inst;
if (LastScheduledInst->getNextNode() != pickedInst) {
BS->BB->getInstList().remove(pickedInst);
BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
pickedInst);
}
LastScheduledInst = pickedInst;
BundleMember = BundleMember->NextInBundle;
}
BS->schedule(picked, ReadyInsts);
NumToSchedule--;
}
assert(NumToSchedule == 0 && "could not schedule all instructions");
// Avoid duplicate scheduling of the block.
BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If V is a store, just return the width of the stored value without
// traversing the expression tree. This is the common case.
if (auto *Store = dyn_cast<StoreInst>(V))
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
// If V is not a store, we can traverse the expression tree to find loads
// that feed it. The type of the loaded value may indicate a more suitable
// width than V's type. We want to base the vector element size on the width
// of memory operations where possible.
SmallVector<Instruction *, 16> Worklist;
SmallPtrSet<Instruction *, 16> Visited;
if (auto *I = dyn_cast<Instruction>(V))
Worklist.push_back(I);
// Traverse the expression tree in bottom-up order looking for loads. If we
// encounter an instruciton we don't yet handle, we give up.
auto MaxWidth = 0u;
auto FoundUnknownInst = false;
while (!Worklist.empty() && !FoundUnknownInst) {
auto *I = Worklist.pop_back_val();
Visited.insert(I);
// We should only be looking at scalar instructions here. If the current
// instruction has a vector type, give up.
auto *Ty = I->getType();
if (isa<VectorType>(Ty))
FoundUnknownInst = true;
// If the current instruction is a load, update MaxWidth to reflect the
// width of the loaded value.
else if (isa<LoadInst>(I))
MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
// Otherwise, we need to visit the operands of the instruction. We only
// handle the interesting cases from buildTree here. If an operand is an
// instruction we haven't yet visited, we add it to the worklist.
else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
if (!Visited.count(J))
Worklist.push_back(J);
}
// If we don't yet handle the instruction, give up.
else
FoundUnknownInst = true;
}
// If we didn't encounter a memory access in the expression tree, or if we
// gave up for some reason, just return the width of V.
if (!MaxWidth || FoundUnknownInst)
return DL->getTypeSizeInBits(V->getType());
// Otherwise, return the maximum width we found.
return MaxWidth;
}
// Determine if a value V in a vectorizable expression Expr can be demoted to a
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
SmallVectorImpl<Value *> &ToDemote,
SmallVectorImpl<Value *> &Roots) {
// We can always demote constants.
if (isa<Constant>(V)) {
ToDemote.push_back(V);
return true;
}
// If the value is not an instruction in the expression with only one use, it
// cannot be demoted.
auto *I = dyn_cast<Instruction>(V);
if (!I || !I->hasOneUse() || !Expr.count(I))
return false;
switch (I->getOpcode()) {
// We can always demote truncations and extensions. Since truncations can
// seed additional demotion, we save the truncated value.
case Instruction::Trunc:
Roots.push_back(I->getOperand(0));
case Instruction::ZExt:
case Instruction::SExt:
break;
// We can demote certain binary operations if we can demote both of their
// operands.
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
!collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
return false;
break;
// We can demote selects if we can demote their true and false values.
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
!collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
return false;
break;
}
// We can demote phis if we can demote all their incoming operands. Note that
// we don't need to worry about cycles since we ensure single use above.
case Instruction::PHI: {
PHINode *PN = cast<PHINode>(I);
for (Value *IncValue : PN->incoming_values())
if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
return false;
break;
}
// Otherwise, conservatively give up.
default:
return false;
}
// Record the value that we can demote.
ToDemote.push_back(V);
return true;
}
void BoUpSLP::computeMinimumValueSizes() {
// If there are no external uses, the expression tree must be rooted by a
// store. We can't demote in-memory values, so there is nothing to do here.
if (ExternalUses.empty())
return;
// We only attempt to truncate integer expressions.
auto &TreeRoot = VectorizableTree[0].Scalars;
auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
if (!TreeRootIT)
return;
// If the expression is not rooted by a store, these roots should have
// external uses. We will rely on InstCombine to rewrite the expression in
// the narrower type. However, InstCombine only rewrites single-use values.
// This means that if a tree entry other than a root is used externally, it
// must have multiple uses and InstCombine will not rewrite it. The code
// below ensures that only the roots are used externally.
SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
for (auto &EU : ExternalUses)
if (!Expr.erase(EU.Scalar))
return;
if (!Expr.empty())
return;
// Collect the scalar values of the vectorizable expression. We will use this
// context to determine which values can be demoted. If we see a truncation,
// we mark it as seeding another demotion.
for (auto &Entry : VectorizableTree)
Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
// Ensure the roots of the vectorizable tree don't form a cycle. They must
// have a single external user that is not in the vectorizable tree.
for (auto *Root : TreeRoot)
if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
return;
// Conservatively determine if we can actually truncate the roots of the
// expression. Collect the values that can be demoted in ToDemote and
// additional roots that require investigating in Roots.
SmallVector<Value *, 32> ToDemote;
SmallVector<Value *, 4> Roots;
for (auto *Root : TreeRoot)
if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
return;
// The maximum bit width required to represent all the values that can be
// demoted without loss of precision. It would be safe to truncate the roots
// of the expression to this width.
auto MaxBitWidth = 8u;
// We first check if all the bits of the roots are demanded. If they're not,
// we can truncate the roots to this narrower type.
for (auto *Root : TreeRoot) {
auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
MaxBitWidth = std::max<unsigned>(
Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
}
// True if the roots can be zero-extended back to their original type, rather
// than sign-extended. We know that if the leading bits are not demanded, we
// can safely zero-extend. So we initialize IsKnownPositive to True.
bool IsKnownPositive = true;
// If all the bits of the roots are demanded, we can try a little harder to
// compute a narrower type. This can happen, for example, if the roots are
// getelementptr indices. InstCombine promotes these indices to the pointer
// width. Thus, all their bits are technically demanded even though the
// address computation might be vectorized in a smaller type.
//
// We start by looking at each entry that can be demoted. We compute the
// maximum bit width required to store the scalar by using ValueTracking to
// compute the number of high-order bits we can truncate.
if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
MaxBitWidth = 8u;
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
bool KnownZero = false;
bool KnownOne = false;
ComputeSignBit(R, KnownZero, KnownOne, *DL);
return KnownZero;
});
// Determine the maximum number of bits required to store the scalar
// values.
for (auto *Scalar : ToDemote) {
auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
}
// If we can't prove that the sign bit is zero, we must add one to the
// maximum bit width to account for the unknown sign bit. This preserves
// the existing sign bit so we can safely sign-extend the root back to the
// original type. Otherwise, if we know the sign bit is zero, we will
// zero-extend the root instead.
//
// FIXME: This is somewhat suboptimal, as there will be cases where adding
// one to the maximum bit width will yield a larger-than-necessary
// type. In general, we need to add an extra bit only if we can't
// prove that the upper bit of the original type is equal to the
// upper bit of the proposed smaller type. If these two bits are the
// same (either zero or one) we know that sign-extending from the
// smaller type will result in the same value. Here, since we can't
// yet prove this, we are just making the proposed smaller type
// larger to ensure correctness.
if (!IsKnownPositive)
++MaxBitWidth;
}
// Round MaxBitWidth up to the next power-of-two.
if (!isPowerOf2_64(MaxBitWidth))
MaxBitWidth = NextPowerOf2(MaxBitWidth);
// If the maximum bit width we compute is less than the with of the roots'
// type, we can proceed with the narrowing. Otherwise, do nothing.
if (MaxBitWidth >= TreeRootIT->getBitWidth())
return;
// If we can truncate the root, we must collect additional values that might
// be demoted as a result. That is, those seeded by truncations we will
// modify.
while (!Roots.empty())
collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
// Finally, map the values we can demote to the maximum bit with we computed.
for (auto *Scalar : ToDemote)
MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
}
namespace {
/// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass {
SLPVectorizerPass Impl;
/// Pass identification, replacement for typeid
static char ID;
explicit SLPVectorizer() : FunctionPass(ID) {
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
}
bool doInitialization(Module &M) override {
return false;
}
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
FunctionPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
};
} // end anonymous namespace
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
auto *LI = &AM.getResult<LoopAnalysis>(F);
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<AAManager>();
PA.preserve<GlobalsAA>();
return PA;
}
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
LoopInfo *LI_, DominatorTree *DT_,
AssumptionCache *AC_, DemandedBits *DB_) {
SE = SE_;
TTI = TTI_;
TLI = TLI_;
AA = AA_;
LI = LI_;
DT = DT_;
AC = AC_;
DB = DB_;
DL = &F.getParent()->getDataLayout();
Stores.clear();
GEPs.clear();
bool Changed = false;
// If the target claims to have no vector registers don't attempt
// vectorization.
if (!TTI->getNumberOfRegisters(true))
return false;
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
// Scan the blocks in the function in post order.
for (auto BB : post_order(&F.getEntryBlock())) {
collectSeedInstructions(BB);
// Vectorize trees that end at stores.
if (!Stores.empty()) {
DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
<< " underlying objects.\n");
Changed |= vectorizeStoreChains(R);
}
// Vectorize trees that end at reductions.
Changed |= vectorizeChainsInBlock(BB, R);
// Vectorize the index computations of getelementptr instructions. This
// is primarily intended to catch gather-like idioms ending at
// non-consecutive loads.
if (!GEPs.empty()) {
DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
<< " underlying objects.\n");
Changed |= vectorizeGEPIndices(BB, R);
}
}
if (Changed) {
R.optimizeGatherSequence();
DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
DEBUG(verifyFunction(F));
}
return Changed;
}
/// \brief Check that the Values in the slice in VL array are still existent in
/// the WeakVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakVH array.
static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
unsigned SliceBegin, unsigned SliceSize) {
VL = VL.slice(SliceBegin, SliceSize);
VH = VH.slice(SliceBegin, SliceSize);
return !std::equal(VL.begin(), VL.end(), VH.begin());
}
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned VecRegSize) {
unsigned ChainLen = Chain.size();
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");
unsigned Sz = R.getVectorElementSize(Chain[0]);
unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
// Keep track of values that were deleted by vectorizing in the loop below.
SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
bool Changed = false;
// Look for profitable vectorizable trees at all offsets, starting at zero.
for (unsigned i = 0, e = ChainLen; i < e; ++i) {
if (i + VF > e)
break;
// Check that a previous iteration of this loop did not delete the Value.
if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
continue;
DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
<< "\n");
ArrayRef<Value *> Operands = Chain.slice(i, VF);
R.buildTree(Operands);
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
R.vectorizeTree();
// Move to the next bundle.
i += VF - 1;
Changed = true;
}
}
return Changed;
}
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP &R) {
SetVector<StoreInst *> Heads, Tails;
SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
// Do a quadratic search on all of the given stores and find
// all of the pairs of stores that follow each other.
SmallVector<unsigned, 16> IndexQueue;
for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
IndexQueue.clear();
// If a store has multiple consecutive store candidates, search Stores
// array according to the sequence: from i+1 to e, then from i-1 to 0.
// This is because usually pairing with immediate succeeding or preceding
// candidate create the best chance to find slp vectorization opportunity.
unsigned j = 0;
for (j = i + 1; j < e; ++j)
IndexQueue.push_back(j);
for (j = i; j > 0; --j)
IndexQueue.push_back(j - 1);
for (auto &k : IndexQueue) {
if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
Tails.insert(Stores[k]);
Heads.insert(Stores[i]);
ConsecutiveChain[Stores[i]] = Stores[k];
break;
}
}
}
// For stores that start but don't end a link in the chain:
for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
it != e; ++it) {
if (Tails.count(*it))
continue;
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
StoreInst *I = *it;
// Collect the chain into a list.
while (Tails.count(I) || Heads.count(I)) {
if (VectorizedStores.count(I))
break;
Operands.push_back(I);
// Move to the next value in the chain.
I = ConsecutiveChain[I];
}
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
Size /= 2) {
if (vectorizeStoreChain(Operands, R, Size)) {
// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Operands.begin(), Operands.end());
Changed = true;
break;
}
}
}
return Changed;
}
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
// Initialize the collections. We will make a single pass over the block.
Stores.clear();
GEPs.clear();
// Visit the store and getelementptr instructions in BB and organize them in
// Stores and GEPs according to the underlying objects of their pointer
// operands.
for (Instruction &I : *BB) {
// Ignore store instructions that are volatile or have a pointer operand
// that doesn't point to a scalar type.
if (auto *SI = dyn_cast<StoreInst>(&I)) {
if (!SI->isSimple())
continue;
if (!isValidElementType(SI->getValueOperand()->getType()))
continue;
Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
}
// Ignore getelementptr instructions that have more than one index, a
// constant index, or a pointer operand that doesn't point to a scalar
// type.
else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
auto Idx = GEP->idx_begin()->get();
if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
continue;
if (!isValidElementType(Idx->getType()))
continue;
if (GEP->getType()->isVectorTy())
continue;
GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
}
}
}
bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = { A, B };
return tryToVectorizeList(VL, R, None, true);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
ArrayRef<Value *> BuildVector,
bool AllowReorder) {
if (VL.size() < 2)
return false;
DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
<< ".\n");
// Check that all of the parts are scalar instructions of the same type.
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
return false;
unsigned Opcode0 = I0->getOpcode();
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
if (MaxVF < 2)
return false;
for (Value *V : VL) {
Type *Ty = V->getType();
if (!isValidElementType(Ty))
return false;
Instruction *Inst = dyn_cast<Instruction>(V);
if (!Inst || Inst->getOpcode() != Opcode0)
return false;
}
bool Changed = false;
// Keep track of values that were deleted by vectorizing in the loop below.
SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
unsigned NextInst = 0, MaxInst = VL.size();
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
VF /= 2) {
// No actual vectorization should happen, if number of parts is the same as
// provided vectorization factor (i.e. the scalar type is used for vector
// code during codegen).
auto *VecTy = VectorType::get(VL[0]->getType(), VF);
if (TTI->getNumberOfParts(VecTy) == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
unsigned OpsWidth = 0;
if (I + VF > MaxInst)
OpsWidth = MaxInst - I;
else
OpsWidth = VF;
if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
break;
// Check that a previous iteration of this loop did not delete the Value.
if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
continue;
DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
<< "\n");
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
ArrayRef<Value *> BuildVectorSlice;
if (!BuildVector.empty())
BuildVectorSlice = BuildVector.slice(I, OpsWidth);
R.buildTree(Ops, BuildVectorSlice);
// TODO: check if we can allow reordering for more cases.
if (AllowReorder && R.shouldReorder()) {
// Conceptually, there is nothing actually preventing us from trying to
// reorder a larger list. In fact, we do exactly this when vectorizing
// reductions. However, at this point, we only expect to get here from
// tryToVectorizePair().
assert(Ops.size() == 2);
assert(BuildVectorSlice.empty());
Value *ReorderedOps[] = {Ops[1], Ops[0]};
R.buildTree(ReorderedOps, None);
}
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
Value *VectorizedRoot = R.vectorizeTree();
// Reconstruct the build vector by extracting the vectorized root. This
// way we handle the case where some elements of the vector are
// undefined.
// (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
if (!BuildVectorSlice.empty()) {
// The insert point is the last build vector instruction. The
// vectorized root will precede it. This guarantees that we get an
// instruction. The vectorized tree could have been constant folded.
Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
unsigned VecIdx = 0;
for (auto &V : BuildVectorSlice) {
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
++BasicBlock::iterator(InsertAfter));
Instruction *I = cast<Instruction>(V);
assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
Instruction *Extract =
cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
I->setOperand(1, Extract);
I->removeFromParent();
I->insertAfter(Extract);
InsertAfter = I;
}
}
// Move to the next bundle.
I += VF - 1;
NextInst = I + 1;
Changed = true;
}
}
}
return Changed;
}
bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
if (!V)
return false;
- Value *P = V->getParent();
-
- // Vectorize in current basic block only.
- auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
- if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
- return false;
-
// Try to vectorize V.
- if (tryToVectorizePair(Op0, Op1, R))
+ if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
return true;
- auto *A = dyn_cast<BinaryOperator>(Op0);
- auto *B = dyn_cast<BinaryOperator>(Op1);
+ BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
+ BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
// Try to skip B.
if (B && B->hasOneUse()) {
- auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
+ BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (tryToVectorizePair(A, B0, R)) {
return true;
- if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
+ }
+ if (tryToVectorizePair(A, B1, R)) {
return true;
+ }
}
// Try to skip A.
if (A && A->hasOneUse()) {
- auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
+ BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (tryToVectorizePair(A0, B, R)) {
return true;
- if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
+ }
+ if (tryToVectorizePair(A1, B, R)) {
return true;
+ }
}
- return false;
+ return 0;
}
/// \brief Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
/// vector.
/// \param IsPairwise Whether the reduction is a pairwise or splitting
/// reduction. A pairwise reduction will generate a mask of
/// <0,2,...> or <1,3,..> while a splitting reduction will generate
/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
bool IsPairwise, bool IsLeft,
IRBuilder<> &Builder) {
assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
SmallVector<Constant *, 32> ShuffleMask(
VecLen, UndefValue::get(Builder.getInt32Ty()));
if (IsPairwise)
// Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
for (unsigned i = 0; i != NumEltsToRdx; ++i)
ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
else
// Move the upper half of the vector to the lower half.
for (unsigned i = 0; i != NumEltsToRdx; ++i)
ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
return ConstantVector::get(ShuffleMask);
}
namespace {
/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction operations (currently add and
/// fadd) that has operations that can be put into a vector as its leaf.
/// For example, this tree:
///
/// mul mul mul mul
/// \ / \ /
/// + +
/// \ /
/// +
/// This tree has "mul" as its reduced values and "+" as its reduction
/// operations. A reduction might be feeding into a store or a binary operation
/// feeding a phi.
/// ...
/// \ /
/// +
/// |
/// phi +=
///
/// Or:
/// ...
/// \ /
/// +
/// |
/// *p =
///
class HorizontalReduction {
SmallVector<Value *, 16> ReductionOps;
SmallVector<Value *, 32> ReducedVals;
BinaryOperator *ReductionRoot;
// After successfull horizontal reduction vectorization attempt for PHI node
// vectorizer tries to update root binary op by combining vectorized tree and
// the ReductionPHI node. But during vectorization this ReductionPHI can be
// vectorized itself and replaced by the undef value, while the instruction
// itself is marked for deletion. This 'marked for deletion' PHI node then can
// be used in new binary operation, causing "Use still stuck around after Def
// is destroyed" crash upon PHI node deletion.
WeakVH ReductionPHI;
/// The opcode of the reduction.
unsigned ReductionOpcode;
/// The opcode of the values we perform a reduction on.
unsigned ReducedValueOpcode;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
bool IsPairwiseReduction;
public:
/// The width of one full horizontal reduction operation.
unsigned ReduxWidth;
/// Minimal width of available vector registers. It's used to determine
/// ReduxWidth.
unsigned MinVecRegSize;
HorizontalReduction(unsigned MinVecRegSize)
: ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
IsPairwiseReduction(false), ReduxWidth(0),
MinVecRegSize(MinVecRegSize) {}
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
// We could have a initial reductions that is not an add.
// r *= v1 + v2 + v3 + v4
// In such a case start looking for a tree rooted in the first '+'.
if (Phi) {
if (B->getOperand(0) == Phi) {
Phi = nullptr;
B = dyn_cast<BinaryOperator>(B->getOperand(1));
} else if (B->getOperand(1) == Phi) {
Phi = nullptr;
B = dyn_cast<BinaryOperator>(B->getOperand(0));
}
}
if (!B)
return false;
Type *Ty = B->getType();
if (!isValidElementType(Ty))
return false;
const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
// FIXME: Register size should be a parameter to this function, so we can
// try different vectorization factors.
ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
ReductionPHI = Phi;
if (ReduxWidth < 4)
return false;
// We currently only support adds.
if (ReductionOpcode != Instruction::Add &&
ReductionOpcode != Instruction::FAdd)
return false;
// Post order traverse the reduction tree starting at B. We only handle true
// trees containing only binary operators or selects.
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
Stack.push_back(std::make_pair(B, 0));
while (!Stack.empty()) {
Instruction *TreeN = Stack.back().first;
unsigned EdgeToVist = Stack.back().second++;
bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
// Only handle trees in the current basic block.
if (TreeN->getParent() != B->getParent())
return false;
// Each tree node needs to have one user except for the ultimate
// reduction.
if (!TreeN->hasOneUse() && TreeN != B)
return false;
// Postorder vist.
if (EdgeToVist == 2 || IsReducedValue) {
if (IsReducedValue) {
// Make sure that the opcodes of the operations that we are going to
// reduce match.
if (!ReducedValueOpcode)
ReducedValueOpcode = TreeN->getOpcode();
else if (ReducedValueOpcode != TreeN->getOpcode())
return false;
ReducedVals.push_back(TreeN);
} else {
// We need to be able to reassociate the adds.
if (!TreeN->isAssociative())
return false;
ReductionOps.push_back(TreeN);
}
// Retract.
Stack.pop_back();
continue;
}
// Visit left or right.
Value *NextV = TreeN->getOperand(EdgeToVist);
if (NextV != Phi) {
auto *I = dyn_cast<Instruction>(NextV);
// Continue analysis if the next operand is a reduction operation or
// (possibly) a reduced value. If the reduced value opcode is not set,
// the first met operation != reduction operation is considered as the
// reduced value class.
if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
I->getOpcode() == ReductionOpcode)) {
if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
ReducedValueOpcode = I->getOpcode();
Stack.push_back(std::make_pair(I, 0));
continue;
}
return false;
}
}
return true;
}
/// \brief Attempt to vectorize the tree found by
/// matchAssociativeReduction.
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
if (ReducedVals.empty())
return false;
unsigned NumReducedVals = ReducedVals.size();
if (NumReducedVals < ReduxWidth)
return false;
Value *VectorizedTree = nullptr;
IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;
Unsafe.setUnsafeAlgebra();
Builder.setFastMathFlags(Unsafe);
unsigned i = 0;
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ReductionOps);
if (V.shouldReorder()) {
SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
V.buildTree(Reversed, ReductionOps);
}
if (V.isTreeTinyAndNotFullyVectorizable())
continue;
V.computeMinimumValueSizes();
// Estimate cost.
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
if (Cost >= -SLPCostThreshold)
break;
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
Value *VectorizedRoot = V.vectorizeTree();
// Emit a reduction.
Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
ReducedSubTree, "bin.rdx");
} else
VectorizedTree = ReducedSubTree;
}
if (VectorizedTree) {
// Finish the reduction.
for (; i < NumReducedVals; ++i) {
Builder.SetCurrentDebugLocation(
cast<Instruction>(ReducedVals[i])->getDebugLoc());
VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
ReducedVals[i]);
}
// Update users.
if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
assert(ReductionRoot && "Need a reduction operation");
ReductionRoot->setOperand(0, VectorizedTree);
ReductionRoot->setOperand(1, ReductionPHI);
} else
ReductionRoot->replaceAllUsesWith(VectorizedTree);
}
return VectorizedTree != nullptr;
}
unsigned numReductionValues() const {
return ReducedVals.size();
}
private:
/// \brief Calculate the cost of a reduction.
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
Type *ScalarTy = FirstReducedVal->getType();
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
int ScalarReduxCost =
(ReduxWidth - 1) *
TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
<< " for reduction that starts with " << *FirstReducedVal
<< " (It is a "
<< (IsPairwiseReduction ? "pairwise" : "splitting")
<< " reduction)\n");
return VecReduxCost - ScalarReduxCost;
}
static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
Value *R, const Twine &Name = "") {
if (Opcode == Instruction::FAdd)
return Builder.CreateFAdd(L, R, Name);
return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
}
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
if (IsPairwiseReduction) {
Value *LeftMask =
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
Value *RightMask =
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
"bin.rdx");
} else {
Value *UpperHalf =
createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
Value *Shuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
}
}
// The result is in the first element of the vector.
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
}
};
} // end anonymous namespace
/// \brief Recognize construction of vectors like
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
///
/// Returns true if it matches
///
static bool findBuildVector(InsertElementInst *FirstInsertElem,
SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
return false;
InsertElementInst *IE = FirstInsertElem;
while (true) {
BuildVector.push_back(IE);
BuildVectorOpds.push_back(IE->getOperand(1));
if (IE->use_empty())
return false;
InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
if (!NextUse)
return true;
// If this isn't the final use, make sure the next insertelement is the only
// use. It's OK if the final constructed vector is used multiple times
if (!IE->hasOneUse())
return false;
IE = NextUse;
}
return false;
}
/// \brief Like findBuildVector, but looks backwards for construction of aggregate.
///
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
if (!IV->hasOneUse())
return false;
Value *V = IV->getAggregateOperand();
if (!isa<UndefValue>(V)) {
InsertValueInst *I = dyn_cast<InsertValueInst>(V);
if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
return false;
}
BuildVector.push_back(IV);
BuildVectorOpds.push_back(IV->getInsertedValueOperand());
return true;
}
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
/// \brief Try and get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
BasicBlock *ParentBB, LoopInfo *LI) {
// There are situations where the reduction value is not dominated by the
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
auto DominatedReduxValue = [&](Value *R) {
return (
dyn_cast<Instruction>(R) &&
DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
};
Value *Rdx = nullptr;
// Return the incoming value if it comes from the same BB as the phi node.
if (P->getIncomingBlock(0) == ParentBB) {
Rdx = P->getIncomingValue(0);
} else if (P->getIncomingBlock(1) == ParentBB) {
Rdx = P->getIncomingValue(1);
}
if (Rdx && DominatedReduxValue(Rdx))
return Rdx;
// Otherwise, check whether we have a loop latch to look at.
Loop *BBL = LI->getLoopFor(ParentBB);
if (!BBL)
return nullptr;
BasicBlock *BBLatch = BBL->getLoopLatch();
if (!BBLatch)
return nullptr;
// There is a loop latch, return the incoming value if it comes from
// that. This reduction pattern occasionally turns up.
if (P->getIncomingBlock(0) == BBLatch) {
Rdx = P->getIncomingValue(0);
} else if (P->getIncomingBlock(1) == BBLatch) {
Rdx = P->getIncomingValue(1);
}
if (Rdx && DominatedReduxValue(Rdx))
return Rdx;
return nullptr;
}
-namespace {
-/// Tracks instructons and its children.
-class WeakVHWithLevel final : public CallbackVH {
- /// Operand index of the instruction currently beeing analized.
- unsigned Level = 0;
- /// Is this the instruction that should be vectorized, or are we now
- /// processing children (i.e. operands of this instruction) for potential
- /// vectorization?
- bool IsInitial = true;
-
-public:
- explicit WeakVHWithLevel() = default;
- WeakVHWithLevel(Value *V) : CallbackVH(V){};
- /// Restart children analysis each time it is repaced by the new instruction.
- void allUsesReplacedWith(Value *New) override {
- setValPtr(New);
- Level = 0;
- IsInitial = true;
- }
- /// Check if the instruction was not deleted during vectorization.
- bool isValid() const { return !getValPtr(); }
- /// Is the istruction itself must be vectorized?
- bool isInitial() const { return IsInitial; }
- /// Try to vectorize children.
- void clearInitial() { IsInitial = false; }
- /// Are all children processed already?
- bool isFinal() const {
- assert(getValPtr() &&
- (isa<Instruction>(getValPtr()) &&
- cast<Instruction>(getValPtr())->getNumOperands() >= Level));
- return getValPtr() &&
- cast<Instruction>(getValPtr())->getNumOperands() == Level;
- }
- /// Get next child operation.
- Value *nextOperand() {
- assert(getValPtr() && isa<Instruction>(getValPtr()) &&
- cast<Instruction>(getValPtr())->getNumOperands() > Level);
- return cast<Instruction>(getValPtr())->getOperand(Level++);
- }
- virtual ~WeakVHWithLevel() = default;
-};
-} // namespace
-
/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding
-/// the phi node P with reduction operators Root in a basic block BB, then check
-/// if it can be done.
+/// the phi node P with reduction operators BI, then check if it
+/// can be done.
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
-static bool canBeVectorized(
- PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI,
- const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
+static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
+ BoUpSLP &R, TargetTransformInfo *TTI,
+ unsigned MinRegSize) {
if (!ShouldVectorizeHor)
return false;
- if (!Root)
+ HorizontalReduction HorRdx(MinRegSize);
+ if (!HorRdx.matchAssociativeReduction(P, BI))
return false;
- if (Root->getParent() != BB)
- return false;
- SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
- SmallSet<Value *, 8> VisitedInstrs;
- bool Res = false;
- while (!Stack.empty()) {
- Value *V = Stack.back();
- if (!V) {
- Stack.pop_back();
- continue;
- }
- auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst || isa<PHINode>(Inst)) {
- Stack.pop_back();
- continue;
- }
- if (Stack.back().isInitial()) {
- Stack.back().clearInitial();
- if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
- HorizontalReduction HorRdx(R.getMinVecRegSize());
- if (HorRdx.matchAssociativeReduction(P, BI)) {
- // If there is a sufficient number of reduction values, reduce
- // to a nearby power-of-2. Can safely generate oversized
- // vectors and rely on the backend to split them to legal sizes.
- HorRdx.ReduxWidth =
- std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+ // If there is a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. Can safely generate oversized
+ // vectors and rely on the backend to split them to legal sizes.
+ HorRdx.ReduxWidth =
+ std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
- if (HorRdx.tryToReduce(R, TTI)) {
- Res = true;
- P = nullptr;
- continue;
- }
- }
- if (P) {
- Inst = dyn_cast<Instruction>(BI->getOperand(0));
- if (Inst == P)
- Inst = dyn_cast<Instruction>(BI->getOperand(1));
- if (!Inst) {
- P = nullptr;
- continue;
- }
- }
- }
- P = nullptr;
- if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
- Res = true;
- continue;
- }
- }
- if (Stack.back().isFinal()) {
- Stack.pop_back();
- continue;
- }
-
- if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
- if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
- Stack.size() < RecursionMaxDepth)
- Stack.push_back(NextV);
- }
- return Res;
+ return HorRdx.tryToReduce(R, TTI);
}
-bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
- BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI) {
- if (!V)
- return false;
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
-
- if (!isa<BinaryOperator>(I))
- P = nullptr;
- // Try to match and vectorize a horizontal reduction.
- return canBeVectorized(P, I, BB, R, TTI,
- [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
- return tryToVectorize(BI, R);
- });
-}
-
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallSet<Value *, 16> VisitedInstrs;
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
HaveVectorizedPhiNodes = false;
// Collect the incoming values from the PHIs.
Incoming.clear();
for (Instruction &I : *BB) {
PHINode *P = dyn_cast<PHINode>(&I);
if (!P)
break;
if (!VisitedInstrs.count(P))
Incoming.push_back(P);
}
// Sort by type.
std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
// Try to vectorize elements base on their type.
for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
E = Incoming.end();
IncIt != E;) {
// Look for the next elements with the same type.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
while (SameTypeIt != E &&
(*SameTypeIt)->getType() == (*IncIt)->getType()) {
VisitedInstrs.insert(*SameTypeIt);
++SameTypeIt;
}
// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);
DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
// Success start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
break;
}
// Start over at the next instruction of a different type (or the end).
IncIt = SameTypeIt;
}
}
VisitedInstrs.clear();
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
// We may go through BB multiple times so skip the one we have checked.
if (!VisitedInstrs.insert(&*it).second)
continue;
if (isa<DbgInfoIntrinsic>(it))
continue;
// Try to vectorize reductions that use PHINodes.
if (PHINode *P = dyn_cast<PHINode>(it)) {
// Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() != 2)
return Changed;
+ Value *Rdx = getReductionValue(DT, P, BB, LI);
+
+ // Check if this is a Binary Operator.
+ BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
+ if (!BI)
+ continue;
+
// Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
- TTI)) {
+ if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
Changed = true;
it = BB->begin();
e = BB->end();
continue;
}
+
+ Value *Inst = BI->getOperand(0);
+ if (Inst == P)
+ Inst = BI->getOperand(1);
+
+ if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid value.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+
continue;
}
- if (ShouldStartVectorizeHorAtStore) {
- if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
- // Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
- TTI)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
+ if (ShouldStartVectorizeHorAtStore)
+ if (StoreInst *SI = dyn_cast<StoreInst>(it))
+ if (BinaryOperator *BinOp =
+ dyn_cast<BinaryOperator>(SI->getValueOperand())) {
+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
+ R.getMinVecRegSize()) ||
+ tryToVectorize(BinOp, R)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
}
- }
- }
// Try to vectorize horizontal reductions feeding into a return.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
- if (RI->getNumOperands() != 0) {
- // Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
+ if (RI->getNumOperands() != 0)
+ if (BinaryOperator *BinOp =
+ dyn_cast<BinaryOperator>(RI->getOperand(0))) {
+ DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
+ R.getMinVecRegSize()) ||
+ tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
+ R)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
}
- }
- }
// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
Changed = true;
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
it = BB->begin();
e = BB->end();
continue;
}
- for (int I = 0; I < 2; ++I) {
- if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
- Changed = true;
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- it = BB->begin();
- e = BB->end();
- break;
+ for (int i = 0; i < 2; ++i) {
+ if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
+ if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid value.
+ it = BB->begin();
+ e = BB->end();
+ break;
+ }
}
}
continue;
}
// Try to vectorize trees that start at insertelement instructions.
if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
SmallVector<Value *, 16> BuildVector;
SmallVector<Value *, 16> BuildVectorOpds;
if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
continue;
// Vectorize starting with the build vector operands ignoring the
// BuildVector instructions for the purpose of scheduling and user
// extraction.
if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
Changed = true;
it = BB->begin();
e = BB->end();
}
continue;
}
// Try to vectorize trees that start at insertvalue instructions feeding into
// a store.
if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
const DataLayout &DL = BB->getModule()->getDataLayout();
if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
SmallVector<Value *, 16> BuildVector;
SmallVector<Value *, 16> BuildVectorOpds;
if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
continue;
DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
Changed = true;
it = BB->begin();
e = BB->end();
}
continue;
}
}
}
}
return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
auto Changed = false;
for (auto &Entry : GEPs) {
// If the getelementptr list has fewer than two elements, there's nothing
// to do.
if (Entry.second.size() < 2)
continue;
DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
<< Entry.second.size() << ".\n");
// We process the getelementptr list in chunks of 16 (like we do for
// stores) to minimize compile-time.
for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
auto Len = std::min<unsigned>(BE - BI, 16);
auto GEPList = makeArrayRef(&Entry.second[BI], Len);
// Initialize a set a candidate getelementptrs. Note that we use a
// SetVector here to preserve program order. If the index computations
// are vectorizable and begin with loads, we want to minimize the chance
// of having to reorder them later.
SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
// Some of the candidates may have already been vectorized after we
// initially collected them. If so, the WeakVHs will have nullified the
// values, so remove them from the set of candidates.
Candidates.remove(nullptr);
// Remove from the set of candidates all pairs of getelementptrs with
// constant differences. Such getelementptrs are likely not good
// candidates for vectorization in a bottom-up phase since one can be
// computed from the other. We also ensure all candidate getelementptr
// indices are unique.
for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
if (!Candidates.count(GEPI))
continue;
auto *SCEVI = SE->getSCEV(GEPList[I]);
for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
auto *SCEVJ = SE->getSCEV(GEPList[J]);
if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
Candidates.remove(GEPList[I]);
Candidates.remove(GEPList[J]);
} else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
Candidates.remove(GEPList[J]);
}
}
}
// We break out of the above computation as soon as we know there are
// fewer than two candidates remaining.
if (Candidates.size() < 2)
continue;
// Add the single, non-constant index of each candidate to the bundle. We
// ensured the indices met these constraints when we originally collected
// the getelementptrs.
SmallVector<Value *, 16> Bundle(Candidates.size());
auto BundleIndex = 0u;
for (auto *V : Candidates) {
auto *GEP = cast<GetElementPtrInst>(V);
auto *GEPIdx = GEP->idx_begin()->get();
assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
Bundle[BundleIndex++] = GEPIdx;
}
// Try and vectorize the indices. We are currently only interested in
// gather-like cases of the form:
//
// ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
//
// where the loads of "a", the loads of "b", and the subtractions can be
// performed in parallel. It's likely that detecting this pattern in a
// bottom-up phase will be simpler and less costly than building a
// full-blown top-down phase beginning at the consecutive loads.
Changed |= tryToVectorizeList(Bundle, R);
}
}
return Changed;
}
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
bool Changed = false;
// Attempt to sort and vectorize each of the store-groups.
for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
++it) {
if (it->second.size() < 2)
continue;
DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
<< it->second.size() << ".\n");
// Process the stores in chunks of 16.
// TODO: The limit of 16 inhibits greater vectorization factors.
// For example, AVX2 supports v32i8. Increasing this limit, however,
// may cause a significant compile-time increase.
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
}
}
return Changed;
}
char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
namespace llvm {
Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
}
Index: projects/clang400-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp (revision 314269)
@@ -1,6795 +1,6797 @@
//===----- CGOpenMPRuntime.cpp - Interface to OpenMP Runtimes -------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation.
//
//===----------------------------------------------------------------------===//
#include "CGCXXABI.h"
#include "CGCleanup.h"
#include "CGOpenMPRuntime.h"
#include "CodeGenFunction.h"
#include "ConstantBuilder.h"
#include "clang/AST/Decl.h"
#include "clang/AST/StmtOpenMP.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
using namespace clang;
using namespace CodeGen;
namespace {
/// \brief Base class for handling code generation inside OpenMP regions.
class CGOpenMPRegionInfo : public CodeGenFunction::CGCapturedStmtInfo {
public:
/// \brief Kinds of OpenMP regions used in codegen.
enum CGOpenMPRegionKind {
/// \brief Region with outlined function for standalone 'parallel'
/// directive.
ParallelOutlinedRegion,
/// \brief Region with outlined function for standalone 'task' directive.
TaskOutlinedRegion,
/// \brief Region for constructs that do not require function outlining,
/// like 'for', 'sections', 'atomic' etc. directives.
InlinedRegion,
/// \brief Region with outlined function for standalone 'target' directive.
TargetRegion,
};
CGOpenMPRegionInfo(const CapturedStmt &CS,
const CGOpenMPRegionKind RegionKind,
const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind,
bool HasCancel)
: CGCapturedStmtInfo(CS, CR_OpenMP), RegionKind(RegionKind),
CodeGen(CodeGen), Kind(Kind), HasCancel(HasCancel) {}
CGOpenMPRegionInfo(const CGOpenMPRegionKind RegionKind,
const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind,
bool HasCancel)
: CGCapturedStmtInfo(CR_OpenMP), RegionKind(RegionKind), CodeGen(CodeGen),
Kind(Kind), HasCancel(HasCancel) {}
/// \brief Get a variable or parameter for storing global thread id
/// inside OpenMP construct.
virtual const VarDecl *getThreadIDVariable() const = 0;
/// \brief Emit the captured statement body.
void EmitBody(CodeGenFunction &CGF, const Stmt *S) override;
/// \brief Get an LValue for the current ThreadID variable.
/// \return LValue for thread id variable. This LValue always has type int32*.
virtual LValue getThreadIDVariableLValue(CodeGenFunction &CGF);
virtual void emitUntiedSwitch(CodeGenFunction & /*CGF*/) {}
CGOpenMPRegionKind getRegionKind() const { return RegionKind; }
OpenMPDirectiveKind getDirectiveKind() const { return Kind; }
bool hasCancel() const { return HasCancel; }
static bool classof(const CGCapturedStmtInfo *Info) {
return Info->getKind() == CR_OpenMP;
}
~CGOpenMPRegionInfo() override = default;
protected:
CGOpenMPRegionKind RegionKind;
RegionCodeGenTy CodeGen;
OpenMPDirectiveKind Kind;
bool HasCancel;
};
/// \brief API for captured statement code generation in OpenMP constructs.
class CGOpenMPOutlinedRegionInfo final : public CGOpenMPRegionInfo {
public:
CGOpenMPOutlinedRegionInfo(const CapturedStmt &CS, const VarDecl *ThreadIDVar,
const RegionCodeGenTy &CodeGen,
OpenMPDirectiveKind Kind, bool HasCancel,
StringRef HelperName)
: CGOpenMPRegionInfo(CS, ParallelOutlinedRegion, CodeGen, Kind,
HasCancel),
ThreadIDVar(ThreadIDVar), HelperName(HelperName) {
assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region.");
}
/// \brief Get a variable or parameter for storing global thread id
/// inside OpenMP construct.
const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; }
/// \brief Get the name of the capture helper.
StringRef getHelperName() const override { return HelperName; }
static bool classof(const CGCapturedStmtInfo *Info) {
return CGOpenMPRegionInfo::classof(Info) &&
cast<CGOpenMPRegionInfo>(Info)->getRegionKind() ==
ParallelOutlinedRegion;
}
private:
/// \brief A variable or parameter storing global thread id for OpenMP
/// constructs.
const VarDecl *ThreadIDVar;
StringRef HelperName;
};
/// \brief API for captured statement code generation in OpenMP constructs.
class CGOpenMPTaskOutlinedRegionInfo final : public CGOpenMPRegionInfo {
public:
class UntiedTaskActionTy final : public PrePostActionTy {
bool Untied;
const VarDecl *PartIDVar;
const RegionCodeGenTy UntiedCodeGen;
llvm::SwitchInst *UntiedSwitch = nullptr;
public:
UntiedTaskActionTy(bool Tied, const VarDecl *PartIDVar,
const RegionCodeGenTy &UntiedCodeGen)
: Untied(!Tied), PartIDVar(PartIDVar), UntiedCodeGen(UntiedCodeGen) {}
void Enter(CodeGenFunction &CGF) override {
if (Untied) {
// Emit task switching point.
auto PartIdLVal = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(PartIDVar),
PartIDVar->getType()->castAs<PointerType>());
auto *Res = CGF.EmitLoadOfScalar(PartIdLVal, SourceLocation());
auto *DoneBB = CGF.createBasicBlock(".untied.done.");
UntiedSwitch = CGF.Builder.CreateSwitch(Res, DoneBB);
CGF.EmitBlock(DoneBB);
CGF.EmitBranchThroughCleanup(CGF.ReturnBlock);
CGF.EmitBlock(CGF.createBasicBlock(".untied.jmp."));
UntiedSwitch->addCase(CGF.Builder.getInt32(0),
CGF.Builder.GetInsertBlock());
emitUntiedSwitch(CGF);
}
}
void emitUntiedSwitch(CodeGenFunction &CGF) const {
if (Untied) {
auto PartIdLVal = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(PartIDVar),
PartIDVar->getType()->castAs<PointerType>());
CGF.EmitStoreOfScalar(CGF.Builder.getInt32(UntiedSwitch->getNumCases()),
PartIdLVal);
UntiedCodeGen(CGF);
CodeGenFunction::JumpDest CurPoint =
CGF.getJumpDestInCurrentScope(".untied.next.");
CGF.EmitBranchThroughCleanup(CGF.ReturnBlock);
CGF.EmitBlock(CGF.createBasicBlock(".untied.jmp."));
UntiedSwitch->addCase(CGF.Builder.getInt32(UntiedSwitch->getNumCases()),
CGF.Builder.GetInsertBlock());
CGF.EmitBranchThroughCleanup(CurPoint);
CGF.EmitBlock(CurPoint.getBlock());
}
}
unsigned getNumberOfParts() const { return UntiedSwitch->getNumCases(); }
};
CGOpenMPTaskOutlinedRegionInfo(const CapturedStmt &CS,
const VarDecl *ThreadIDVar,
const RegionCodeGenTy &CodeGen,
OpenMPDirectiveKind Kind, bool HasCancel,
const UntiedTaskActionTy &Action)
: CGOpenMPRegionInfo(CS, TaskOutlinedRegion, CodeGen, Kind, HasCancel),
ThreadIDVar(ThreadIDVar), Action(Action) {
assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region.");
}
/// \brief Get a variable or parameter for storing global thread id
/// inside OpenMP construct.
const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; }
/// \brief Get an LValue for the current ThreadID variable.
LValue getThreadIDVariableLValue(CodeGenFunction &CGF) override;
/// \brief Get the name of the capture helper.
StringRef getHelperName() const override { return ".omp_outlined."; }
void emitUntiedSwitch(CodeGenFunction &CGF) override {
Action.emitUntiedSwitch(CGF);
}
static bool classof(const CGCapturedStmtInfo *Info) {
return CGOpenMPRegionInfo::classof(Info) &&
cast<CGOpenMPRegionInfo>(Info)->getRegionKind() ==
TaskOutlinedRegion;
}
private:
/// \brief A variable or parameter storing global thread id for OpenMP
/// constructs.
const VarDecl *ThreadIDVar;
/// Action for emitting code for untied tasks.
const UntiedTaskActionTy &Action;
};
/// \brief API for inlined captured statement code generation in OpenMP
/// constructs.
class CGOpenMPInlinedRegionInfo : public CGOpenMPRegionInfo {
public:
CGOpenMPInlinedRegionInfo(CodeGenFunction::CGCapturedStmtInfo *OldCSI,
const RegionCodeGenTy &CodeGen,
OpenMPDirectiveKind Kind, bool HasCancel)
: CGOpenMPRegionInfo(InlinedRegion, CodeGen, Kind, HasCancel),
OldCSI(OldCSI),
OuterRegionInfo(dyn_cast_or_null<CGOpenMPRegionInfo>(OldCSI)) {}
// \brief Retrieve the value of the context parameter.
llvm::Value *getContextValue() const override {
if (OuterRegionInfo)
return OuterRegionInfo->getContextValue();
llvm_unreachable("No context value for inlined OpenMP region");
}
void setContextValue(llvm::Value *V) override {
if (OuterRegionInfo) {
OuterRegionInfo->setContextValue(V);
return;
}
llvm_unreachable("No context value for inlined OpenMP region");
}
/// \brief Lookup the captured field decl for a variable.
const FieldDecl *lookup(const VarDecl *VD) const override {
if (OuterRegionInfo)
return OuterRegionInfo->lookup(VD);
// If there is no outer outlined region,no need to lookup in a list of
// captured variables, we can use the original one.
return nullptr;
}
FieldDecl *getThisFieldDecl() const override {
if (OuterRegionInfo)
return OuterRegionInfo->getThisFieldDecl();
return nullptr;
}
/// \brief Get a variable or parameter for storing global thread id
/// inside OpenMP construct.
const VarDecl *getThreadIDVariable() const override {
if (OuterRegionInfo)
return OuterRegionInfo->getThreadIDVariable();
return nullptr;
}
/// \brief Get the name of the capture helper.
StringRef getHelperName() const override {
if (auto *OuterRegionInfo = getOldCSI())
return OuterRegionInfo->getHelperName();
llvm_unreachable("No helper name for inlined OpenMP construct");
}
void emitUntiedSwitch(CodeGenFunction &CGF) override {
if (OuterRegionInfo)
OuterRegionInfo->emitUntiedSwitch(CGF);
}
CodeGenFunction::CGCapturedStmtInfo *getOldCSI() const { return OldCSI; }
static bool classof(const CGCapturedStmtInfo *Info) {
return CGOpenMPRegionInfo::classof(Info) &&
cast<CGOpenMPRegionInfo>(Info)->getRegionKind() == InlinedRegion;
}
~CGOpenMPInlinedRegionInfo() override = default;
private:
/// \brief CodeGen info about outer OpenMP region.
CodeGenFunction::CGCapturedStmtInfo *OldCSI;
CGOpenMPRegionInfo *OuterRegionInfo;
};
/// \brief API for captured statement code generation in OpenMP target
/// constructs. For this captures, implicit parameters are used instead of the
/// captured fields. The name of the target region has to be unique in a given
/// application so it is provided by the client, because only the client has
/// the information to generate that.
class CGOpenMPTargetRegionInfo final : public CGOpenMPRegionInfo {
public:
CGOpenMPTargetRegionInfo(const CapturedStmt &CS,
const RegionCodeGenTy &CodeGen, StringRef HelperName)
: CGOpenMPRegionInfo(CS, TargetRegion, CodeGen, OMPD_target,
/*HasCancel=*/false),
HelperName(HelperName) {}
/// \brief This is unused for target regions because each starts executing
/// with a single thread.
const VarDecl *getThreadIDVariable() const override { return nullptr; }
/// \brief Get the name of the capture helper.
StringRef getHelperName() const override { return HelperName; }
static bool classof(const CGCapturedStmtInfo *Info) {
return CGOpenMPRegionInfo::classof(Info) &&
cast<CGOpenMPRegionInfo>(Info)->getRegionKind() == TargetRegion;
}
private:
StringRef HelperName;
};
static void EmptyCodeGen(CodeGenFunction &, PrePostActionTy &) {
llvm_unreachable("No codegen for expressions");
}
/// \brief API for generation of expressions captured in a innermost OpenMP
/// region.
class CGOpenMPInnerExprInfo final : public CGOpenMPInlinedRegionInfo {
public:
CGOpenMPInnerExprInfo(CodeGenFunction &CGF, const CapturedStmt &CS)
: CGOpenMPInlinedRegionInfo(CGF.CapturedStmtInfo, EmptyCodeGen,
OMPD_unknown,
/*HasCancel=*/false),
PrivScope(CGF) {
// Make sure the globals captured in the provided statement are local by
// using the privatization logic. We assume the same variable is not
// captured more than once.
for (auto &C : CS.captures()) {
if (!C.capturesVariable() && !C.capturesVariableByCopy())
continue;
const VarDecl *VD = C.getCapturedVar();
if (VD->isLocalVarDeclOrParm())
continue;
DeclRefExpr DRE(const_cast<VarDecl *>(VD),
/*RefersToEnclosingVariableOrCapture=*/false,
VD->getType().getNonReferenceType(), VK_LValue,
SourceLocation());
PrivScope.addPrivate(VD, [&CGF, &DRE]() -> Address {
return CGF.EmitLValue(&DRE).getAddress();
});
}
(void)PrivScope.Privatize();
}
/// \brief Lookup the captured field decl for a variable.
const FieldDecl *lookup(const VarDecl *VD) const override {
if (auto *FD = CGOpenMPInlinedRegionInfo::lookup(VD))
return FD;
return nullptr;
}
/// \brief Emit the captured statement body.
void EmitBody(CodeGenFunction &CGF, const Stmt *S) override {
llvm_unreachable("No body for expressions");
}
/// \brief Get a variable or parameter for storing global thread id
/// inside OpenMP construct.
const VarDecl *getThreadIDVariable() const override {
llvm_unreachable("No thread id for expressions");
}
/// \brief Get the name of the capture helper.
StringRef getHelperName() const override {
llvm_unreachable("No helper name for expressions");
}
static bool classof(const CGCapturedStmtInfo *Info) { return false; }
private:
/// Private scope to capture global variables.
CodeGenFunction::OMPPrivateScope PrivScope;
};
/// \brief RAII for emitting code of OpenMP constructs.
class InlinedOpenMPRegionRAII {
CodeGenFunction &CGF;
llvm::DenseMap<const VarDecl *, FieldDecl *> LambdaCaptureFields;
FieldDecl *LambdaThisCaptureField = nullptr;
public:
/// \brief Constructs region for combined constructs.
/// \param CodeGen Code generation sequence for combined directives. Includes
/// a list of functions used for code generation of implicitly inlined
/// regions.
InlinedOpenMPRegionRAII(CodeGenFunction &CGF, const RegionCodeGenTy &CodeGen,
OpenMPDirectiveKind Kind, bool HasCancel)
: CGF(CGF) {
// Start emission for the construct.
CGF.CapturedStmtInfo = new CGOpenMPInlinedRegionInfo(
CGF.CapturedStmtInfo, CodeGen, Kind, HasCancel);
std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
LambdaThisCaptureField = CGF.LambdaThisCaptureField;
CGF.LambdaThisCaptureField = nullptr;
}
~InlinedOpenMPRegionRAII() {
// Restore original CapturedStmtInfo only if we're done with code emission.
auto *OldCSI =
cast<CGOpenMPInlinedRegionInfo>(CGF.CapturedStmtInfo)->getOldCSI();
delete CGF.CapturedStmtInfo;
CGF.CapturedStmtInfo = OldCSI;
std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
CGF.LambdaThisCaptureField = LambdaThisCaptureField;
}
};
/// \brief Values for bit flags used in the ident_t to describe the fields.
/// All enumeric elements are named and described in accordance with the code
/// from http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
enum OpenMPLocationFlags {
/// \brief Use trampoline for internal microtask.
OMP_IDENT_IMD = 0x01,
/// \brief Use c-style ident structure.
OMP_IDENT_KMPC = 0x02,
/// \brief Atomic reduction option for kmpc_reduce.
OMP_ATOMIC_REDUCE = 0x10,
/// \brief Explicit 'barrier' directive.
OMP_IDENT_BARRIER_EXPL = 0x20,
/// \brief Implicit barrier in code.
OMP_IDENT_BARRIER_IMPL = 0x40,
/// \brief Implicit barrier in 'for' directive.
OMP_IDENT_BARRIER_IMPL_FOR = 0x40,
/// \brief Implicit barrier in 'sections' directive.
OMP_IDENT_BARRIER_IMPL_SECTIONS = 0xC0,
/// \brief Implicit barrier in 'single' directive.
OMP_IDENT_BARRIER_IMPL_SINGLE = 0x140
};
/// \brief Describes ident structure that describes a source location.
/// All descriptions are taken from
/// http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
/// Original structure:
/// typedef struct ident {
/// kmp_int32 reserved_1; /**< might be used in Fortran;
/// see above */
/// kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags;
/// KMP_IDENT_KMPC identifies this union
/// member */
/// kmp_int32 reserved_2; /**< not really used in Fortran any more;
/// see above */
///#if USE_ITT_BUILD
/// /* but currently used for storing
/// region-specific ITT */
/// /* contextual information. */
///#endif /* USE_ITT_BUILD */
/// kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for
/// C++ */
/// char const *psource; /**< String describing the source location.
/// The string is composed of semi-colon separated
// fields which describe the source file,
/// the function and a pair of line numbers that
/// delimit the construct.
/// */
/// } ident_t;
enum IdentFieldIndex {
/// \brief might be used in Fortran
IdentField_Reserved_1,
/// \brief OMP_IDENT_xxx flags; OMP_IDENT_KMPC identifies this union member.
IdentField_Flags,
/// \brief Not really used in Fortran any more
IdentField_Reserved_2,
/// \brief Source[4] in Fortran, do not use for C++
IdentField_Reserved_3,
/// \brief String describing the source location. The string is composed of
/// semi-colon separated fields which describe the source file, the function
/// and a pair of line numbers that delimit the construct.
IdentField_PSource
};
/// \brief Schedule types for 'omp for' loops (these enumerators are taken from
/// the enum sched_type in kmp.h).
enum OpenMPSchedType {
/// \brief Lower bound for default (unordered) versions.
OMP_sch_lower = 32,
OMP_sch_static_chunked = 33,
OMP_sch_static = 34,
OMP_sch_dynamic_chunked = 35,
OMP_sch_guided_chunked = 36,
OMP_sch_runtime = 37,
OMP_sch_auto = 38,
/// static with chunk adjustment (e.g., simd)
OMP_sch_static_balanced_chunked = 45,
/// \brief Lower bound for 'ordered' versions.
OMP_ord_lower = 64,
OMP_ord_static_chunked = 65,
OMP_ord_static = 66,
OMP_ord_dynamic_chunked = 67,
OMP_ord_guided_chunked = 68,
OMP_ord_runtime = 69,
OMP_ord_auto = 70,
OMP_sch_default = OMP_sch_static,
/// \brief dist_schedule types
OMP_dist_sch_static_chunked = 91,
OMP_dist_sch_static = 92,
/// Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
/// Set if the monotonic schedule modifier was present.
OMP_sch_modifier_monotonic = (1 << 29),
/// Set if the nonmonotonic schedule modifier was present.
OMP_sch_modifier_nonmonotonic = (1 << 30),
};
enum OpenMPRTLFunction {
/// \brief Call to void __kmpc_fork_call(ident_t *loc, kmp_int32 argc,
/// kmpc_micro microtask, ...);
OMPRTL__kmpc_fork_call,
/// \brief Call to void *__kmpc_threadprivate_cached(ident_t *loc,
/// kmp_int32 global_tid, void *data, size_t size, void ***cache);
OMPRTL__kmpc_threadprivate_cached,
/// \brief Call to void __kmpc_threadprivate_register( ident_t *,
/// void *data, kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor);
OMPRTL__kmpc_threadprivate_register,
// Call to __kmpc_int32 kmpc_global_thread_num(ident_t *loc);
OMPRTL__kmpc_global_thread_num,
// Call to void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *crit);
OMPRTL__kmpc_critical,
// Call to void __kmpc_critical_with_hint(ident_t *loc, kmp_int32
// global_tid, kmp_critical_name *crit, uintptr_t hint);
OMPRTL__kmpc_critical_with_hint,
// Call to void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *crit);
OMPRTL__kmpc_end_critical,
// Call to kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32
// global_tid);
OMPRTL__kmpc_cancel_barrier,
// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_barrier,
// Call to void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_for_static_fini,
// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
// global_tid);
OMPRTL__kmpc_serialized_parallel,
// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
// global_tid);
OMPRTL__kmpc_end_serialized_parallel,
// Call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 num_threads);
OMPRTL__kmpc_push_num_threads,
// Call to void __kmpc_flush(ident_t *loc);
OMPRTL__kmpc_flush,
// Call to kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid);
OMPRTL__kmpc_master,
// Call to void __kmpc_end_master(ident_t *, kmp_int32 global_tid);
OMPRTL__kmpc_end_master,
// Call to kmp_int32 __kmpc_omp_taskyield(ident_t *, kmp_int32 global_tid,
// int end_part);
OMPRTL__kmpc_omp_taskyield,
// Call to kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid);
OMPRTL__kmpc_single,
// Call to void __kmpc_end_single(ident_t *, kmp_int32 global_tid);
OMPRTL__kmpc_end_single,
// Call to kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
// kmp_routine_entry_t *task_entry);
OMPRTL__kmpc_omp_task_alloc,
// Call to kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t *
// new_task);
OMPRTL__kmpc_omp_task,
// Call to void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
// size_t cpy_size, void *cpy_data, void(*cpy_func)(void *, void *),
// kmp_int32 didit);
OMPRTL__kmpc_copyprivate,
// Call to kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void
// (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck);
OMPRTL__kmpc_reduce,
// Call to kmp_int32 __kmpc_reduce_nowait(ident_t *loc, kmp_int32
// global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
// void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name
// *lck);
OMPRTL__kmpc_reduce_nowait,
// Call to void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *lck);
OMPRTL__kmpc_end_reduce,
// Call to void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *lck);
OMPRTL__kmpc_end_reduce_nowait,
// Call to void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid,
// kmp_task_t * new_task);
OMPRTL__kmpc_omp_task_begin_if0,
// Call to void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
// kmp_task_t * new_task);
OMPRTL__kmpc_omp_task_complete_if0,
// Call to void __kmpc_ordered(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_ordered,
// Call to void __kmpc_end_ordered(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_end_ordered,
// Call to kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
// global_tid);
OMPRTL__kmpc_omp_taskwait,
// Call to void __kmpc_taskgroup(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_taskgroup,
// Call to void __kmpc_end_taskgroup(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_end_taskgroup,
// Call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
// int proc_bind);
OMPRTL__kmpc_push_proc_bind,
// Call to kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32
// gtid, kmp_task_t * new_task, kmp_int32 ndeps, kmp_depend_info_t
// *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
OMPRTL__kmpc_omp_task_with_deps,
// Call to void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32
// gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
// ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
OMPRTL__kmpc_omp_wait_deps,
// Call to kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32
// global_tid, kmp_int32 cncl_kind);
OMPRTL__kmpc_cancellationpoint,
// Call to kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 cncl_kind);
OMPRTL__kmpc_cancel,
// Call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 num_teams, kmp_int32 thread_limit);
OMPRTL__kmpc_push_num_teams,
// Call to void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro
// microtask, ...);
OMPRTL__kmpc_fork_teams,
// Call to void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
// if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
// sched, kmp_uint64 grainsize, void *task_dup);
OMPRTL__kmpc_taskloop,
// Call to void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32
// num_dims, struct kmp_dim *dims);
OMPRTL__kmpc_doacross_init,
// Call to void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
OMPRTL__kmpc_doacross_fini,
// Call to void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64
// *vec);
OMPRTL__kmpc_doacross_post,
// Call to void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64
// *vec);
OMPRTL__kmpc_doacross_wait,
//
// Offloading related calls
//
// Call to int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
// arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t
// *arg_types);
OMPRTL__tgt_target,
// Call to int32_t __tgt_target_teams(int32_t device_id, void *host_ptr,
// int32_t arg_num, void** args_base, void **args, size_t *arg_sizes,
// int32_t *arg_types, int32_t num_teams, int32_t thread_limit);
OMPRTL__tgt_target_teams,
// Call to void __tgt_register_lib(__tgt_bin_desc *desc);
OMPRTL__tgt_register_lib,
// Call to void __tgt_unregister_lib(__tgt_bin_desc *desc);
OMPRTL__tgt_unregister_lib,
// Call to void __tgt_target_data_begin(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
OMPRTL__tgt_target_data_begin,
// Call to void __tgt_target_data_end(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
OMPRTL__tgt_target_data_end,
// Call to void __tgt_target_data_update(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
OMPRTL__tgt_target_data_update,
};
/// A basic class for pre|post-action for advanced codegen sequence for OpenMP
/// region.
class CleanupTy final : public EHScopeStack::Cleanup {
PrePostActionTy *Action;
public:
explicit CleanupTy(PrePostActionTy *Action) : Action(Action) {}
void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
if (!CGF.HaveInsertPoint())
return;
Action->Exit(CGF);
}
};
} // anonymous namespace
void RegionCodeGenTy::operator()(CodeGenFunction &CGF) const {
CodeGenFunction::RunCleanupsScope Scope(CGF);
if (PrePostAction) {
CGF.EHStack.pushCleanup<CleanupTy>(NormalAndEHCleanup, PrePostAction);
Callback(CodeGen, CGF, *PrePostAction);
} else {
PrePostActionTy Action;
Callback(CodeGen, CGF, Action);
}
}
LValue CGOpenMPRegionInfo::getThreadIDVariableLValue(CodeGenFunction &CGF) {
return CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(getThreadIDVariable()),
getThreadIDVariable()->getType()->castAs<PointerType>());
}
void CGOpenMPRegionInfo::EmitBody(CodeGenFunction &CGF, const Stmt * /*S*/) {
if (!CGF.HaveInsertPoint())
return;
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CGF.EHStack.pushTerminate();
CodeGen(CGF);
CGF.EHStack.popTerminate();
}
LValue CGOpenMPTaskOutlinedRegionInfo::getThreadIDVariableLValue(
CodeGenFunction &CGF) {
return CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(getThreadIDVariable()),
getThreadIDVariable()->getType(),
AlignmentSource::Decl);
}
CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
: CGM(CGM), OffloadEntriesInfoManager(CGM) {
IdentTy = llvm::StructType::create(
"ident_t", CGM.Int32Ty /* reserved_1 */, CGM.Int32Ty /* flags */,
CGM.Int32Ty /* reserved_2 */, CGM.Int32Ty /* reserved_3 */,
CGM.Int8PtrTy /* psource */, nullptr);
KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8);
loadOffloadInfoMetadata();
}
void CGOpenMPRuntime::clear() {
InternalVars.clear();
}
static llvm::Function *
emitCombinerOrInitializer(CodeGenModule &CGM, QualType Ty,
const Expr *CombinerInitializer, const VarDecl *In,
const VarDecl *Out, bool IsCombiner) {
// void .omp_combiner.(Ty *in, Ty *out);
auto &C = CGM.getContext();
QualType PtrTy = C.getPointerType(Ty).withRestrict();
FunctionArgList Args;
ImplicitParamDecl OmpOutParm(C, /*DC=*/nullptr, Out->getLocation(),
/*Id=*/nullptr, PtrTy);
ImplicitParamDecl OmpInParm(C, /*DC=*/nullptr, In->getLocation(),
/*Id=*/nullptr, PtrTy);
Args.push_back(&OmpOutParm);
Args.push_back(&OmpInParm);
auto &FnInfo =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo);
auto *Fn = llvm::Function::Create(
FnTy, llvm::GlobalValue::InternalLinkage,
IsCombiner ? ".omp_combiner." : ".omp_initializer.", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo);
Fn->removeFnAttr(llvm::Attribute::NoInline);
Fn->addFnAttr(llvm::Attribute::AlwaysInline);
CodeGenFunction CGF(CGM);
// Map "T omp_in;" variable to "*omp_in_parm" value in all expressions.
// Map "T omp_out;" variable to "*omp_out_parm" value in all expressions.
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
CodeGenFunction::OMPPrivateScope Scope(CGF);
Address AddrIn = CGF.GetAddrOfLocalVar(&OmpInParm);
Scope.addPrivate(In, [&CGF, AddrIn, PtrTy]() -> Address {
return CGF.EmitLoadOfPointerLValue(AddrIn, PtrTy->castAs<PointerType>())
.getAddress();
});
Address AddrOut = CGF.GetAddrOfLocalVar(&OmpOutParm);
Scope.addPrivate(Out, [&CGF, AddrOut, PtrTy]() -> Address {
return CGF.EmitLoadOfPointerLValue(AddrOut, PtrTy->castAs<PointerType>())
.getAddress();
});
(void)Scope.Privatize();
CGF.EmitIgnoredExpr(CombinerInitializer);
Scope.ForceCleanup();
CGF.FinishFunction();
return Fn;
}
void CGOpenMPRuntime::emitUserDefinedReduction(
CodeGenFunction *CGF, const OMPDeclareReductionDecl *D) {
if (UDRMap.count(D) > 0)
return;
auto &C = CGM.getContext();
if (!In || !Out) {
In = &C.Idents.get("omp_in");
Out = &C.Idents.get("omp_out");
}
llvm::Function *Combiner = emitCombinerOrInitializer(
CGM, D->getType(), D->getCombiner(), cast<VarDecl>(D->lookup(In).front()),
cast<VarDecl>(D->lookup(Out).front()),
/*IsCombiner=*/true);
llvm::Function *Initializer = nullptr;
if (auto *Init = D->getInitializer()) {
if (!Priv || !Orig) {
Priv = &C.Idents.get("omp_priv");
Orig = &C.Idents.get("omp_orig");
}
Initializer = emitCombinerOrInitializer(
CGM, D->getType(), Init, cast<VarDecl>(D->lookup(Orig).front()),
cast<VarDecl>(D->lookup(Priv).front()),
/*IsCombiner=*/false);
}
UDRMap.insert(std::make_pair(D, std::make_pair(Combiner, Initializer)));
if (CGF) {
auto &Decls = FunctionUDRMap.FindAndConstruct(CGF->CurFn);
Decls.second.push_back(D);
}
}
std::pair<llvm::Function *, llvm::Function *>
CGOpenMPRuntime::getUserDefinedReduction(const OMPDeclareReductionDecl *D) {
auto I = UDRMap.find(D);
if (I != UDRMap.end())
return I->second;
emitUserDefinedReduction(/*CGF=*/nullptr, D);
return UDRMap.lookup(D);
}
// Layout information for ident_t.
static CharUnits getIdentAlign(CodeGenModule &CGM) {
return CGM.getPointerAlign();
}
static CharUnits getIdentSize(CodeGenModule &CGM) {
assert((4 * CGM.getPointerSize()).isMultipleOf(CGM.getPointerAlign()));
return CharUnits::fromQuantity(16) + CGM.getPointerSize();
}
static CharUnits getOffsetOfIdentField(IdentFieldIndex Field) {
// All the fields except the last are i32, so this works beautifully.
return unsigned(Field) * CharUnits::fromQuantity(4);
}
static Address createIdentFieldGEP(CodeGenFunction &CGF, Address Addr,
IdentFieldIndex Field,
const llvm::Twine &Name = "") {
auto Offset = getOffsetOfIdentField(Field);
return CGF.Builder.CreateStructGEP(Addr, Field, Offset, Name);
}
llvm::Value *CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
assert(ThreadIDVar->getType()->isPointerType() &&
"thread id variable must be of type kmp_int32 *");
const CapturedStmt *CS = cast<CapturedStmt>(D.getAssociatedStmt());
CodeGenFunction CGF(CGM, true);
bool HasCancel = false;
if (auto *OPD = dyn_cast<OMPParallelDirective>(&D))
HasCancel = OPD->hasCancel();
else if (auto *OPSD = dyn_cast<OMPParallelSectionsDirective>(&D))
HasCancel = OPSD->hasCancel();
else if (auto *OPFD = dyn_cast<OMPParallelForDirective>(&D))
HasCancel = OPFD->hasCancel();
CGOpenMPOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind,
HasCancel, getOutlinedHelperName());
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
return CGF.GenerateOpenMPCapturedStmtFunction(*CS);
}
llvm::Value *CGOpenMPRuntime::emitTaskOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
const VarDecl *PartIDVar, const VarDecl *TaskTVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen,
bool Tied, unsigned &NumberOfParts) {
auto &&UntiedCodeGen = [this, &D, TaskTVar](CodeGenFunction &CGF,
PrePostActionTy &) {
auto *ThreadID = getThreadID(CGF, D.getLocStart());
auto *UpLoc = emitUpdateLocation(CGF, D.getLocStart());
llvm::Value *TaskArgs[] = {
UpLoc, ThreadID,
CGF.EmitLoadOfPointerLValue(CGF.GetAddrOfLocalVar(TaskTVar),
TaskTVar->getType()->castAs<PointerType>())
.getPointer()};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task), TaskArgs);
};
CGOpenMPTaskOutlinedRegionInfo::UntiedTaskActionTy Action(Tied, PartIDVar,
UntiedCodeGen);
CodeGen.setAction(Action);
assert(!ThreadIDVar->getType()->isPointerType() &&
"thread id variable must be of type kmp_int32 for tasks");
auto *CS = cast<CapturedStmt>(D.getAssociatedStmt());
auto *TD = dyn_cast<OMPTaskDirective>(&D);
CodeGenFunction CGF(CGM, true);
CGOpenMPTaskOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen,
InnermostKind,
TD ? TD->hasCancel() : false, Action);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
auto *Res = CGF.GenerateCapturedStmtFunction(*CS);
if (!Tied)
NumberOfParts = Action.getNumberOfParts();
return Res;
}
Address CGOpenMPRuntime::getOrCreateDefaultLocation(unsigned Flags) {
CharUnits Align = getIdentAlign(CGM);
llvm::Value *Entry = OpenMPDefaultLocMap.lookup(Flags);
if (!Entry) {
if (!DefaultOpenMPPSource) {
// Initialize default location for psource field of ident_t structure of
// all ident_t objects. Format is ";file;function;line;column;;".
// Taken from
// http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp_str.c
DefaultOpenMPPSource =
CGM.GetAddrOfConstantCString(";unknown;unknown;0;0;;").getPointer();
DefaultOpenMPPSource =
llvm::ConstantExpr::getBitCast(DefaultOpenMPPSource, CGM.Int8PtrTy);
}
ConstantInitBuilder builder(CGM);
auto fields = builder.beginStruct(IdentTy);
fields.addInt(CGM.Int32Ty, 0);
fields.addInt(CGM.Int32Ty, Flags);
fields.addInt(CGM.Int32Ty, 0);
fields.addInt(CGM.Int32Ty, 0);
fields.add(DefaultOpenMPPSource);
auto DefaultOpenMPLocation =
fields.finishAndCreateGlobal("", Align, /*isConstant*/ true,
llvm::GlobalValue::PrivateLinkage);
DefaultOpenMPLocation->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
OpenMPDefaultLocMap[Flags] = Entry = DefaultOpenMPLocation;
}
return Address(Entry, Align);
}
llvm::Value *CGOpenMPRuntime::emitUpdateLocation(CodeGenFunction &CGF,
SourceLocation Loc,
unsigned Flags) {
Flags |= OMP_IDENT_KMPC;
// If no debug info is generated - return global default location.
if (CGM.getCodeGenOpts().getDebugInfo() == codegenoptions::NoDebugInfo ||
Loc.isInvalid())
return getOrCreateDefaultLocation(Flags).getPointer();
assert(CGF.CurFn && "No function in current CodeGenFunction.");
Address LocValue = Address::invalid();
auto I = OpenMPLocThreadIDMap.find(CGF.CurFn);
if (I != OpenMPLocThreadIDMap.end())
LocValue = Address(I->second.DebugLoc, getIdentAlign(CGF.CGM));
// OpenMPLocThreadIDMap may have null DebugLoc and non-null ThreadID, if
// GetOpenMPThreadID was called before this routine.
if (!LocValue.isValid()) {
// Generate "ident_t .kmpc_loc.addr;"
Address AI = CGF.CreateTempAlloca(IdentTy, getIdentAlign(CGF.CGM),
".kmpc_loc.addr");
auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn);
Elem.second.DebugLoc = AI.getPointer();
LocValue = AI;
CGBuilderTy::InsertPointGuard IPG(CGF.Builder);
CGF.Builder.SetInsertPoint(CGF.AllocaInsertPt);
CGF.Builder.CreateMemCpy(LocValue, getOrCreateDefaultLocation(Flags),
CGM.getSize(getIdentSize(CGF.CGM)));
}
// char **psource = &.kmpc_loc_<flags>.addr.psource;
Address PSource = createIdentFieldGEP(CGF, LocValue, IdentField_PSource);
auto OMPDebugLoc = OpenMPDebugLocMap.lookup(Loc.getRawEncoding());
if (OMPDebugLoc == nullptr) {
SmallString<128> Buffer2;
llvm::raw_svector_ostream OS2(Buffer2);
// Build debug location
PresumedLoc PLoc = CGF.getContext().getSourceManager().getPresumedLoc(Loc);
OS2 << ";" << PLoc.getFilename() << ";";
if (const FunctionDecl *FD =
dyn_cast_or_null<FunctionDecl>(CGF.CurFuncDecl)) {
OS2 << FD->getQualifiedNameAsString();
}
OS2 << ";" << PLoc.getLine() << ";" << PLoc.getColumn() << ";;";
OMPDebugLoc = CGF.Builder.CreateGlobalStringPtr(OS2.str());
OpenMPDebugLocMap[Loc.getRawEncoding()] = OMPDebugLoc;
}
// *psource = ";<File>;<Function>;<Line>;<Column>;;";
CGF.Builder.CreateStore(OMPDebugLoc, PSource);
// Our callers always pass this to a runtime function, so for
// convenience, go ahead and return a naked pointer.
return LocValue.getPointer();
}
llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF,
SourceLocation Loc) {
assert(CGF.CurFn && "No function in current CodeGenFunction.");
llvm::Value *ThreadID = nullptr;
// Check whether we've already cached a load of the thread id in this
// function.
auto I = OpenMPLocThreadIDMap.find(CGF.CurFn);
if (I != OpenMPLocThreadIDMap.end()) {
ThreadID = I->second.ThreadID;
if (ThreadID != nullptr)
return ThreadID;
}
if (auto *OMPRegionInfo =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
if (OMPRegionInfo->getThreadIDVariable()) {
// Check if this an outlined function with thread id passed as argument.
auto LVal = OMPRegionInfo->getThreadIDVariableLValue(CGF);
ThreadID = CGF.EmitLoadOfLValue(LVal, Loc).getScalarVal();
// If value loaded in entry block, cache it and use it everywhere in
// function.
if (CGF.Builder.GetInsertBlock() == CGF.AllocaInsertPt->getParent()) {
auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn);
Elem.second.ThreadID = ThreadID;
}
return ThreadID;
}
}
// This is not an outlined function region - need to call __kmpc_int32
// kmpc_global_thread_num(ident_t *loc).
// Generate thread id value and cache this value for use across the
// function.
CGBuilderTy::InsertPointGuard IPG(CGF.Builder);
CGF.Builder.SetInsertPoint(CGF.AllocaInsertPt);
ThreadID =
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_global_thread_num),
emitUpdateLocation(CGF, Loc));
auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn);
Elem.second.ThreadID = ThreadID;
return ThreadID;
}
void CGOpenMPRuntime::functionFinished(CodeGenFunction &CGF) {
assert(CGF.CurFn && "No function in current CodeGenFunction.");
if (OpenMPLocThreadIDMap.count(CGF.CurFn))
OpenMPLocThreadIDMap.erase(CGF.CurFn);
if (FunctionUDRMap.count(CGF.CurFn) > 0) {
for(auto *D : FunctionUDRMap[CGF.CurFn]) {
UDRMap.erase(D);
}
FunctionUDRMap.erase(CGF.CurFn);
}
}
llvm::Type *CGOpenMPRuntime::getIdentTyPointerTy() {
if (!IdentTy) {
}
return llvm::PointerType::getUnqual(IdentTy);
}
llvm::Type *CGOpenMPRuntime::getKmpc_MicroPointerTy() {
if (!Kmpc_MicroTy) {
// Build void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid,...)
llvm::Type *MicroParams[] = {llvm::PointerType::getUnqual(CGM.Int32Ty),
llvm::PointerType::getUnqual(CGM.Int32Ty)};
Kmpc_MicroTy = llvm::FunctionType::get(CGM.VoidTy, MicroParams, true);
}
return llvm::PointerType::getUnqual(Kmpc_MicroTy);
}
llvm::Constant *
CGOpenMPRuntime::createRuntimeFunction(unsigned Function) {
llvm::Constant *RTLFn = nullptr;
switch (static_cast<OpenMPRTLFunction>(Function)) {
case OMPRTL__kmpc_fork_call: {
// Build void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro
// microtask, ...);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
getKmpc_MicroPointerTy()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ true);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_fork_call");
break;
}
case OMPRTL__kmpc_global_thread_num: {
// Build kmp_int32 __kmpc_global_thread_num(ident_t *loc);
llvm::Type *TypeParams[] = {getIdentTyPointerTy()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_global_thread_num");
break;
}
case OMPRTL__kmpc_threadprivate_cached: {
// Build void *__kmpc_threadprivate_cached(ident_t *loc,
// kmp_int32 global_tid, void *data, size_t size, void ***cache);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.VoidPtrTy, CGM.SizeTy,
CGM.VoidPtrTy->getPointerTo()->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_threadprivate_cached");
break;
}
case OMPRTL__kmpc_critical: {
// Build void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *crit);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty,
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_critical");
break;
}
case OMPRTL__kmpc_critical_with_hint: {
// Build void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *crit, uintptr_t hint);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
llvm::PointerType::getUnqual(KmpCriticalNameTy),
CGM.IntPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_critical_with_hint");
break;
}
case OMPRTL__kmpc_threadprivate_register: {
// Build void __kmpc_threadprivate_register(ident_t *, void *data,
// kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor);
// typedef void *(*kmpc_ctor)(void *);
auto KmpcCtorTy =
llvm::FunctionType::get(CGM.VoidPtrTy, CGM.VoidPtrTy,
/*isVarArg*/ false)->getPointerTo();
// typedef void *(*kmpc_cctor)(void *, void *);
llvm::Type *KmpcCopyCtorTyArgs[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
auto KmpcCopyCtorTy =
llvm::FunctionType::get(CGM.VoidPtrTy, KmpcCopyCtorTyArgs,
/*isVarArg*/ false)->getPointerTo();
// typedef void (*kmpc_dtor)(void *);
auto KmpcDtorTy =
llvm::FunctionType::get(CGM.VoidTy, CGM.VoidPtrTy, /*isVarArg*/ false)
->getPointerTo();
llvm::Type *FnTyArgs[] = {getIdentTyPointerTy(), CGM.VoidPtrTy, KmpcCtorTy,
KmpcCopyCtorTy, KmpcDtorTy};
auto FnTy = llvm::FunctionType::get(CGM.VoidTy, FnTyArgs,
/*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_threadprivate_register");
break;
}
case OMPRTL__kmpc_end_critical: {
// Build void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *crit);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty,
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_critical");
break;
}
case OMPRTL__kmpc_cancel_barrier: {
// Build kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32
// global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_cancel_barrier");
break;
}
case OMPRTL__kmpc_barrier: {
// Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
break;
}
case OMPRTL__kmpc_for_static_fini: {
// Build void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_for_static_fini");
break;
}
case OMPRTL__kmpc_push_num_threads: {
// Build void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 num_threads)
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_num_threads");
break;
}
case OMPRTL__kmpc_serialized_parallel: {
// Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
// global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
break;
}
case OMPRTL__kmpc_end_serialized_parallel: {
// Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
// global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
break;
}
case OMPRTL__kmpc_flush: {
// Build void __kmpc_flush(ident_t *loc);
llvm::Type *TypeParams[] = {getIdentTyPointerTy()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_flush");
break;
}
case OMPRTL__kmpc_master: {
// Build kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_master");
break;
}
case OMPRTL__kmpc_end_master: {
// Build void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_master");
break;
}
case OMPRTL__kmpc_omp_taskyield: {
// Build kmp_int32 __kmpc_omp_taskyield(ident_t *, kmp_int32 global_tid,
// int end_part);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_taskyield");
break;
}
case OMPRTL__kmpc_single: {
// Build kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_single");
break;
}
case OMPRTL__kmpc_end_single: {
// Build void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_single");
break;
}
case OMPRTL__kmpc_omp_task_alloc: {
// Build kmp_task_t *__kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
// kmp_routine_entry_t *task_entry);
assert(KmpRoutineEntryPtrTy != nullptr &&
"Type kmp_routine_entry_t must be created.");
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty,
CGM.SizeTy, CGM.SizeTy, KmpRoutineEntryPtrTy};
// Return void * and then cast to particular kmp_task_t type.
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task_alloc");
break;
}
case OMPRTL__kmpc_omp_task: {
// Build kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t
// *new_task);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task");
break;
}
case OMPRTL__kmpc_copyprivate: {
// Build void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
// size_t cpy_size, void *cpy_data, void(*cpy_func)(void *, void *),
// kmp_int32 didit);
llvm::Type *CpyTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
auto *CpyFnTy =
llvm::FunctionType::get(CGM.VoidTy, CpyTypeParams, /*isVarArg=*/false);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.SizeTy,
CGM.VoidPtrTy, CpyFnTy->getPointerTo(),
CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_copyprivate");
break;
}
case OMPRTL__kmpc_reduce: {
// Build kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void
// (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck);
llvm::Type *ReduceTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
auto *ReduceFnTy = llvm::FunctionType::get(CGM.VoidTy, ReduceTypeParams,
/*isVarArg=*/false);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy,
CGM.VoidPtrTy, ReduceFnTy->getPointerTo(),
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_reduce");
break;
}
case OMPRTL__kmpc_reduce_nowait: {
// Build kmp_int32 __kmpc_reduce_nowait(ident_t *loc, kmp_int32
// global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
// void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name
// *lck);
llvm::Type *ReduceTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
auto *ReduceFnTy = llvm::FunctionType::get(CGM.VoidTy, ReduceTypeParams,
/*isVarArg=*/false);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy,
CGM.VoidPtrTy, ReduceFnTy->getPointerTo(),
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_reduce_nowait");
break;
}
case OMPRTL__kmpc_end_reduce: {
// Build void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *lck);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty,
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_reduce");
break;
}
case OMPRTL__kmpc_end_reduce_nowait: {
// Build __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *lck);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty,
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn =
CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_reduce_nowait");
break;
}
case OMPRTL__kmpc_omp_task_begin_if0: {
// Build void __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t
// *new_task);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn =
CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task_begin_if0");
break;
}
case OMPRTL__kmpc_omp_task_complete_if0: {
// Build void __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t
// *new_task);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy,
/*Name=*/"__kmpc_omp_task_complete_if0");
break;
}
case OMPRTL__kmpc_ordered: {
// Build void __kmpc_ordered(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_ordered");
break;
}
case OMPRTL__kmpc_end_ordered: {
// Build void __kmpc_end_ordered(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_ordered");
break;
}
case OMPRTL__kmpc_omp_taskwait: {
// Build kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_omp_taskwait");
break;
}
case OMPRTL__kmpc_taskgroup: {
// Build void __kmpc_taskgroup(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_taskgroup");
break;
}
case OMPRTL__kmpc_end_taskgroup: {
// Build void __kmpc_end_taskgroup(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_taskgroup");
break;
}
case OMPRTL__kmpc_push_proc_bind: {
// Build void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
// int proc_bind)
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_proc_bind");
break;
}
case OMPRTL__kmpc_omp_task_with_deps: {
// Build kmp_int32 __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid,
// kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
// kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty, CGM.VoidPtrTy, CGM.Int32Ty,
CGM.VoidPtrTy, CGM.Int32Ty, CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn =
CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task_with_deps");
break;
}
case OMPRTL__kmpc_omp_wait_deps: {
// Build void __kmpc_omp_wait_deps(ident_t *, kmp_int32 gtid,
// kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
// kmp_depend_info_t *noalias_dep_list);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.Int32Ty, CGM.VoidPtrTy,
CGM.Int32Ty, CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_wait_deps");
break;
}
case OMPRTL__kmpc_cancellationpoint: {
// Build kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32
// global_tid, kmp_int32 cncl_kind)
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancellationpoint");
break;
}
case OMPRTL__kmpc_cancel: {
// Build kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 cncl_kind)
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancel");
break;
}
case OMPRTL__kmpc_push_num_teams: {
// Build void kmpc_push_num_teams (ident_t loc, kmp_int32 global_tid,
// kmp_int32 num_teams, kmp_int32 num_threads)
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty,
CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_num_teams");
break;
}
case OMPRTL__kmpc_fork_teams: {
// Build void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro
// microtask, ...);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
getKmpc_MicroPointerTy()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ true);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_fork_teams");
break;
}
case OMPRTL__kmpc_taskloop: {
// Build void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
// if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
// sched, kmp_uint64 grainsize, void *task_dup);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
CGM.IntTy,
CGM.VoidPtrTy,
CGM.IntTy,
CGM.Int64Ty->getPointerTo(),
CGM.Int64Ty->getPointerTo(),
CGM.Int64Ty,
CGM.IntTy,
CGM.IntTy,
CGM.Int64Ty,
CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_taskloop");
break;
}
case OMPRTL__kmpc_doacross_init: {
// Build void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32
// num_dims, struct kmp_dim *dims);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
CGM.Int32Ty,
CGM.Int32Ty,
CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_init");
break;
}
case OMPRTL__kmpc_doacross_fini: {
// Build void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_fini");
break;
}
case OMPRTL__kmpc_doacross_post: {
// Build void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64
// *vec);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.Int64Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_post");
break;
}
case OMPRTL__kmpc_doacross_wait: {
// Build void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64
// *vec);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.Int64Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_wait");
break;
}
case OMPRTL__tgt_target: {
// Build int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
// arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t
// *arg_types);
llvm::Type *TypeParams[] = {CGM.Int32Ty,
CGM.VoidPtrTy,
CGM.Int32Ty,
CGM.VoidPtrPtrTy,
CGM.VoidPtrPtrTy,
CGM.SizeTy->getPointerTo(),
CGM.Int32Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target");
break;
}
case OMPRTL__tgt_target_teams: {
// Build int32_t __tgt_target_teams(int32_t device_id, void *host_ptr,
// int32_t arg_num, void** args_base, void **args, size_t *arg_sizes,
// int32_t *arg_types, int32_t num_teams, int32_t thread_limit);
llvm::Type *TypeParams[] = {CGM.Int32Ty,
CGM.VoidPtrTy,
CGM.Int32Ty,
CGM.VoidPtrPtrTy,
CGM.VoidPtrPtrTy,
CGM.SizeTy->getPointerTo(),
CGM.Int32Ty->getPointerTo(),
CGM.Int32Ty,
CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_teams");
break;
}
case OMPRTL__tgt_register_lib: {
// Build void __tgt_register_lib(__tgt_bin_desc *desc);
QualType ParamTy =
CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy());
llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_register_lib");
break;
}
case OMPRTL__tgt_unregister_lib: {
// Build void __tgt_unregister_lib(__tgt_bin_desc *desc);
QualType ParamTy =
CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy());
llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_unregister_lib");
break;
}
case OMPRTL__tgt_target_data_begin: {
// Build void __tgt_target_data_begin(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
llvm::Type *TypeParams[] = {CGM.Int32Ty,
CGM.Int32Ty,
CGM.VoidPtrPtrTy,
CGM.VoidPtrPtrTy,
CGM.SizeTy->getPointerTo(),
CGM.Int32Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_begin");
break;
}
case OMPRTL__tgt_target_data_end: {
// Build void __tgt_target_data_end(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
llvm::Type *TypeParams[] = {CGM.Int32Ty,
CGM.Int32Ty,
CGM.VoidPtrPtrTy,
CGM.VoidPtrPtrTy,
CGM.SizeTy->getPointerTo(),
CGM.Int32Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_end");
break;
}
case OMPRTL__tgt_target_data_update: {
// Build void __tgt_target_data_update(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
llvm::Type *TypeParams[] = {CGM.Int32Ty,
CGM.Int32Ty,
CGM.VoidPtrPtrTy,
CGM.VoidPtrPtrTy,
CGM.SizeTy->getPointerTo(),
CGM.Int32Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_update");
break;
}
}
assert(RTLFn && "Unable to find OpenMP runtime function");
return RTLFn;
}
llvm::Constant *CGOpenMPRuntime::createForStaticInitFunction(unsigned IVSize,
bool IVSigned) {
assert((IVSize == 32 || IVSize == 64) &&
"IV size is not compatible with the omp runtime");
auto Name = IVSize == 32 ? (IVSigned ? "__kmpc_for_static_init_4"
: "__kmpc_for_static_init_4u")
: (IVSigned ? "__kmpc_for_static_init_8"
: "__kmpc_for_static_init_8u");
auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
auto PtrTy = llvm::PointerType::getUnqual(ITy);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), // loc
CGM.Int32Ty, // tid
CGM.Int32Ty, // schedtype
llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
PtrTy, // p_lower
PtrTy, // p_upper
PtrTy, // p_stride
ITy, // incr
ITy // chunk
};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
return CGM.CreateRuntimeFunction(FnTy, Name);
}
llvm::Constant *CGOpenMPRuntime::createDispatchInitFunction(unsigned IVSize,
bool IVSigned) {
assert((IVSize == 32 || IVSize == 64) &&
"IV size is not compatible with the omp runtime");
auto Name =
IVSize == 32
? (IVSigned ? "__kmpc_dispatch_init_4" : "__kmpc_dispatch_init_4u")
: (IVSigned ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_8u");
auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
llvm::Type *TypeParams[] = { getIdentTyPointerTy(), // loc
CGM.Int32Ty, // tid
CGM.Int32Ty, // schedtype
ITy, // lower
ITy, // upper
ITy, // stride
ITy // chunk
};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
return CGM.CreateRuntimeFunction(FnTy, Name);
}
llvm::Constant *CGOpenMPRuntime::createDispatchFiniFunction(unsigned IVSize,
bool IVSigned) {
assert((IVSize == 32 || IVSize == 64) &&
"IV size is not compatible with the omp runtime");
auto Name =
IVSize == 32
? (IVSigned ? "__kmpc_dispatch_fini_4" : "__kmpc_dispatch_fini_4u")
: (IVSigned ? "__kmpc_dispatch_fini_8" : "__kmpc_dispatch_fini_8u");
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), // loc
CGM.Int32Ty, // tid
};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
return CGM.CreateRuntimeFunction(FnTy, Name);
}
llvm::Constant *CGOpenMPRuntime::createDispatchNextFunction(unsigned IVSize,
bool IVSigned) {
assert((IVSize == 32 || IVSize == 64) &&
"IV size is not compatible with the omp runtime");
auto Name =
IVSize == 32
? (IVSigned ? "__kmpc_dispatch_next_4" : "__kmpc_dispatch_next_4u")
: (IVSigned ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_8u");
auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
auto PtrTy = llvm::PointerType::getUnqual(ITy);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), // loc
CGM.Int32Ty, // tid
llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
PtrTy, // p_lower
PtrTy, // p_upper
PtrTy // p_stride
};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
return CGM.CreateRuntimeFunction(FnTy, Name);
}
llvm::Constant *
CGOpenMPRuntime::getOrCreateThreadPrivateCache(const VarDecl *VD) {
assert(!CGM.getLangOpts().OpenMPUseTLS ||
!CGM.getContext().getTargetInfo().isTLSSupported());
// Lookup the entry, lazily creating it if necessary.
return getOrCreateInternalVariable(CGM.Int8PtrPtrTy,
Twine(CGM.getMangledName(VD)) + ".cache.");
}
Address CGOpenMPRuntime::getAddrOfThreadPrivate(CodeGenFunction &CGF,
const VarDecl *VD,
Address VDAddr,
SourceLocation Loc) {
if (CGM.getLangOpts().OpenMPUseTLS &&
CGM.getContext().getTargetInfo().isTLSSupported())
return VDAddr;
auto VarTy = VDAddr.getElementType();
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
CGF.Builder.CreatePointerCast(VDAddr.getPointer(),
CGM.Int8PtrTy),
CGM.getSize(CGM.GetTargetTypeStoreSize(VarTy)),
getOrCreateThreadPrivateCache(VD)};
return Address(CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_threadprivate_cached), Args),
VDAddr.getAlignment());
}
void CGOpenMPRuntime::emitThreadPrivateVarInit(
CodeGenFunction &CGF, Address VDAddr, llvm::Value *Ctor,
llvm::Value *CopyCtor, llvm::Value *Dtor, SourceLocation Loc) {
// Call kmp_int32 __kmpc_global_thread_num(&loc) to init OpenMP runtime
// library.
auto OMPLoc = emitUpdateLocation(CGF, Loc);
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_global_thread_num),
OMPLoc);
// Call __kmpc_threadprivate_register(&loc, &var, ctor, cctor/*NULL*/, dtor)
// to register constructor/destructor for variable.
llvm::Value *Args[] = {OMPLoc,
CGF.Builder.CreatePointerCast(VDAddr.getPointer(),
CGM.VoidPtrTy),
Ctor, CopyCtor, Dtor};
CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_threadprivate_register), Args);
}
llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
const VarDecl *VD, Address VDAddr, SourceLocation Loc,
bool PerformInit, CodeGenFunction *CGF) {
if (CGM.getLangOpts().OpenMPUseTLS &&
CGM.getContext().getTargetInfo().isTLSSupported())
return nullptr;
VD = VD->getDefinition(CGM.getContext());
if (VD && ThreadPrivateWithDefinition.count(VD) == 0) {
ThreadPrivateWithDefinition.insert(VD);
QualType ASTTy = VD->getType();
llvm::Value *Ctor = nullptr, *CopyCtor = nullptr, *Dtor = nullptr;
auto Init = VD->getAnyInitializer();
if (CGM.getLangOpts().CPlusPlus && PerformInit) {
// Generate function that re-emits the declaration's initializer into the
// threadprivate copy of the variable VD
CodeGenFunction CtorCGF(CGM);
FunctionArgList Args;
ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, SourceLocation(),
/*Id=*/nullptr, CGM.getContext().VoidPtrTy);
Args.push_back(&Dst);
auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(
CGM.getContext().VoidPtrTy, Args);
auto FTy = CGM.getTypes().GetFunctionType(FI);
auto Fn = CGM.CreateGlobalInitOrDestructFunction(
FTy, ".__kmpc_global_ctor_.", FI, Loc);
CtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidPtrTy, Fn, FI,
Args, SourceLocation());
auto ArgVal = CtorCGF.EmitLoadOfScalar(
CtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false,
CGM.getContext().VoidPtrTy, Dst.getLocation());
Address Arg = Address(ArgVal, VDAddr.getAlignment());
Arg = CtorCGF.Builder.CreateElementBitCast(Arg,
CtorCGF.ConvertTypeForMem(ASTTy));
CtorCGF.EmitAnyExprToMem(Init, Arg, Init->getType().getQualifiers(),
/*IsInitializer=*/true);
ArgVal = CtorCGF.EmitLoadOfScalar(
CtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false,
CGM.getContext().VoidPtrTy, Dst.getLocation());
CtorCGF.Builder.CreateStore(ArgVal, CtorCGF.ReturnValue);
CtorCGF.FinishFunction();
Ctor = Fn;
}
if (VD->getType().isDestructedType() != QualType::DK_none) {
// Generate function that emits destructor call for the threadprivate copy
// of the variable VD
CodeGenFunction DtorCGF(CGM);
FunctionArgList Args;
ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, SourceLocation(),
/*Id=*/nullptr, CGM.getContext().VoidPtrTy);
Args.push_back(&Dst);
auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(
CGM.getContext().VoidTy, Args);
auto FTy = CGM.getTypes().GetFunctionType(FI);
auto Fn = CGM.CreateGlobalInitOrDestructFunction(
FTy, ".__kmpc_global_dtor_.", FI, Loc);
auto NL = ApplyDebugLocation::CreateEmpty(DtorCGF);
DtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, Fn, FI, Args,
SourceLocation());
// Create a scope with an artificial location for the body of this function.
auto AL = ApplyDebugLocation::CreateArtificial(DtorCGF);
auto ArgVal = DtorCGF.EmitLoadOfScalar(
DtorCGF.GetAddrOfLocalVar(&Dst),
/*Volatile=*/false, CGM.getContext().VoidPtrTy, Dst.getLocation());
DtorCGF.emitDestroy(Address(ArgVal, VDAddr.getAlignment()), ASTTy,
DtorCGF.getDestroyer(ASTTy.isDestructedType()),
DtorCGF.needsEHCleanup(ASTTy.isDestructedType()));
DtorCGF.FinishFunction();
Dtor = Fn;
}
// Do not emit init function if it is not required.
if (!Ctor && !Dtor)
return nullptr;
llvm::Type *CopyCtorTyArgs[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
auto CopyCtorTy =
llvm::FunctionType::get(CGM.VoidPtrTy, CopyCtorTyArgs,
/*isVarArg=*/false)->getPointerTo();
// Copying constructor for the threadprivate variable.
// Must be NULL - reserved by runtime, but currently it requires that this
// parameter is always NULL. Otherwise it fires assertion.
CopyCtor = llvm::Constant::getNullValue(CopyCtorTy);
if (Ctor == nullptr) {
auto CtorTy = llvm::FunctionType::get(CGM.VoidPtrTy, CGM.VoidPtrTy,
/*isVarArg=*/false)->getPointerTo();
Ctor = llvm::Constant::getNullValue(CtorTy);
}
if (Dtor == nullptr) {
auto DtorTy = llvm::FunctionType::get(CGM.VoidTy, CGM.VoidPtrTy,
/*isVarArg=*/false)->getPointerTo();
Dtor = llvm::Constant::getNullValue(DtorTy);
}
if (!CGF) {
auto InitFunctionTy =
llvm::FunctionType::get(CGM.VoidTy, /*isVarArg*/ false);
auto InitFunction = CGM.CreateGlobalInitOrDestructFunction(
InitFunctionTy, ".__omp_threadprivate_init_.",
CGM.getTypes().arrangeNullaryFunction());
CodeGenFunction InitCGF(CGM);
FunctionArgList ArgList;
InitCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, InitFunction,
CGM.getTypes().arrangeNullaryFunction(), ArgList,
Loc);
emitThreadPrivateVarInit(InitCGF, VDAddr, Ctor, CopyCtor, Dtor, Loc);
InitCGF.FinishFunction();
return InitFunction;
}
emitThreadPrivateVarInit(*CGF, VDAddr, Ctor, CopyCtor, Dtor, Loc);
}
return nullptr;
}
/// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen
/// function. Here is the logic:
/// if (Cond) {
/// ThenGen();
/// } else {
/// ElseGen();
/// }
void CGOpenMPRuntime::emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
const RegionCodeGenTy &ThenGen,
const RegionCodeGenTy &ElseGen) {
CodeGenFunction::LexicalScope ConditionScope(CGF, Cond->getSourceRange());
// If the condition constant folds and can be elided, try to avoid emitting
// the condition and the dead arm of the if/else.
bool CondConstant;
if (CGF.ConstantFoldsToSimpleInteger(Cond, CondConstant)) {
if (CondConstant)
ThenGen(CGF);
else
ElseGen(CGF);
return;
}
// Otherwise, the condition did not fold, or we couldn't elide it. Just
// emit the conditional branch.
auto ThenBlock = CGF.createBasicBlock("omp_if.then");
auto ElseBlock = CGF.createBasicBlock("omp_if.else");
auto ContBlock = CGF.createBasicBlock("omp_if.end");
CGF.EmitBranchOnBoolExpr(Cond, ThenBlock, ElseBlock, /*TrueCount=*/0);
// Emit the 'then' code.
CGF.EmitBlock(ThenBlock);
ThenGen(CGF);
CGF.EmitBranch(ContBlock);
// Emit the 'else' code if present.
// There is no need to emit line number for unconditional branch.
(void)ApplyDebugLocation::CreateEmpty(CGF);
CGF.EmitBlock(ElseBlock);
ElseGen(CGF);
// There is no need to emit line number for unconditional branch.
(void)ApplyDebugLocation::CreateEmpty(CGF);
CGF.EmitBranch(ContBlock);
// Emit the continuation block for code after the if.
CGF.EmitBlock(ContBlock, /*IsFinished=*/true);
}
void CGOpenMPRuntime::emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
const Expr *IfCond) {
if (!CGF.HaveInsertPoint())
return;
auto *RTLoc = emitUpdateLocation(CGF, Loc);
auto &&ThenGen = [OutlinedFn, CapturedVars, RTLoc](CodeGenFunction &CGF,
PrePostActionTy &) {
// Build call __kmpc_fork_call(loc, n, microtask, var1, .., varn);
auto &RT = CGF.CGM.getOpenMPRuntime();
llvm::Value *Args[] = {
RTLoc,
CGF.Builder.getInt32(CapturedVars.size()), // Number of captured vars
CGF.Builder.CreateBitCast(OutlinedFn, RT.getKmpc_MicroPointerTy())};
llvm::SmallVector<llvm::Value *, 16> RealArgs;
RealArgs.append(std::begin(Args), std::end(Args));
RealArgs.append(CapturedVars.begin(), CapturedVars.end());
auto RTLFn = RT.createRuntimeFunction(OMPRTL__kmpc_fork_call);
CGF.EmitRuntimeCall(RTLFn, RealArgs);
};
auto &&ElseGen = [OutlinedFn, CapturedVars, RTLoc, Loc](CodeGenFunction &CGF,
PrePostActionTy &) {
auto &RT = CGF.CGM.getOpenMPRuntime();
auto ThreadID = RT.getThreadID(CGF, Loc);
// Build calls:
// __kmpc_serialized_parallel(&Loc, GTid);
llvm::Value *Args[] = {RTLoc, ThreadID};
CGF.EmitRuntimeCall(
RT.createRuntimeFunction(OMPRTL__kmpc_serialized_parallel), Args);
// OutlinedFn(&GTid, &zero, CapturedStruct);
auto ThreadIDAddr = RT.emitThreadIDAddress(CGF, Loc);
Address ZeroAddr =
CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
/*Name*/ ".zero.addr");
CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
// __kmpc_end_serialized_parallel(&Loc, GTid);
llvm::Value *EndArgs[] = {RT.emitUpdateLocation(CGF, Loc), ThreadID};
CGF.EmitRuntimeCall(
RT.createRuntimeFunction(OMPRTL__kmpc_end_serialized_parallel),
EndArgs);
};
if (IfCond)
emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
else {
RegionCodeGenTy ThenRCG(ThenGen);
ThenRCG(CGF);
}
}
// If we're inside an (outlined) parallel region, use the region info's
// thread-ID variable (it is passed in a first argument of the outlined function
// as "kmp_int32 *gtid"). Otherwise, if we're not inside parallel region, but in
// regular serial code region, get thread ID by calling kmp_int32
// kmpc_global_thread_num(ident_t *loc), stash this thread ID in a temporary and
// return the address of that temp.
Address CGOpenMPRuntime::emitThreadIDAddress(CodeGenFunction &CGF,
SourceLocation Loc) {
if (auto *OMPRegionInfo =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
if (OMPRegionInfo->getThreadIDVariable())
return OMPRegionInfo->getThreadIDVariableLValue(CGF).getAddress();
auto ThreadID = getThreadID(CGF, Loc);
auto Int32Ty =
CGF.getContext().getIntTypeForBitwidth(/*DestWidth*/ 32, /*Signed*/ true);
auto ThreadIDTemp = CGF.CreateMemTemp(Int32Ty, /*Name*/ ".threadid_temp.");
CGF.EmitStoreOfScalar(ThreadID,
CGF.MakeAddrLValue(ThreadIDTemp, Int32Ty));
return ThreadIDTemp;
}
llvm::Constant *
CGOpenMPRuntime::getOrCreateInternalVariable(llvm::Type *Ty,
const llvm::Twine &Name) {
SmallString<256> Buffer;
llvm::raw_svector_ostream Out(Buffer);
Out << Name;
auto RuntimeName = Out.str();
auto &Elem = *InternalVars.insert(std::make_pair(RuntimeName, nullptr)).first;
if (Elem.second) {
assert(Elem.second->getType()->getPointerElementType() == Ty &&
"OMP internal variable has different type than requested");
return &*Elem.second;
}
return Elem.second = new llvm::GlobalVariable(
CGM.getModule(), Ty, /*IsConstant*/ false,
llvm::GlobalValue::CommonLinkage, llvm::Constant::getNullValue(Ty),
Elem.first());
}
llvm::Value *CGOpenMPRuntime::getCriticalRegionLock(StringRef CriticalName) {
llvm::Twine Name(".gomp_critical_user_", CriticalName);
return getOrCreateInternalVariable(KmpCriticalNameTy, Name.concat(".var"));
}
namespace {
/// Common pre(post)-action for different OpenMP constructs.
class CommonActionTy final : public PrePostActionTy {
llvm::Value *EnterCallee;
ArrayRef<llvm::Value *> EnterArgs;
llvm::Value *ExitCallee;
ArrayRef<llvm::Value *> ExitArgs;
bool Conditional;
llvm::BasicBlock *ContBlock = nullptr;
public:
CommonActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
bool Conditional = false)
: EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
ExitArgs(ExitArgs), Conditional(Conditional) {}
void Enter(CodeGenFunction &CGF) override {
llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
if (Conditional) {
llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
ContBlock = CGF.createBasicBlock("omp_if.end");
// Generate the branch (If-stmt)
CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
CGF.EmitBlock(ThenBlock);
}
}
void Done(CodeGenFunction &CGF) {
// Emit the rest of blocks/branches
CGF.EmitBranch(ContBlock);
CGF.EmitBlock(ContBlock, true);
}
void Exit(CodeGenFunction &CGF) override {
CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
}
};
} // anonymous namespace
void CGOpenMPRuntime::emitCriticalRegion(CodeGenFunction &CGF,
StringRef CriticalName,
const RegionCodeGenTy &CriticalOpGen,
SourceLocation Loc, const Expr *Hint) {
// __kmpc_critical[_with_hint](ident_t *, gtid, Lock[, hint]);
// CriticalOpGen();
// __kmpc_end_critical(ident_t *, gtid, Lock);
// Prepare arguments and build a call to __kmpc_critical
if (!CGF.HaveInsertPoint())
return;
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
getCriticalRegionLock(CriticalName)};
llvm::SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args),
std::end(Args));
if (Hint) {
EnterArgs.push_back(CGF.Builder.CreateIntCast(
CGF.EmitScalarExpr(Hint), CGM.IntPtrTy, /*isSigned=*/false));
}
CommonActionTy Action(
createRuntimeFunction(Hint ? OMPRTL__kmpc_critical_with_hint
: OMPRTL__kmpc_critical),
EnterArgs, createRuntimeFunction(OMPRTL__kmpc_end_critical), Args);
CriticalOpGen.setAction(Action);
emitInlinedDirective(CGF, OMPD_critical, CriticalOpGen);
}
void CGOpenMPRuntime::emitMasterRegion(CodeGenFunction &CGF,
const RegionCodeGenTy &MasterOpGen,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// if(__kmpc_master(ident_t *, gtid)) {
// MasterOpGen();
// __kmpc_end_master(ident_t *, gtid);
// }
// Prepare arguments and build a call to __kmpc_master
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_master), Args,
createRuntimeFunction(OMPRTL__kmpc_end_master), Args,
/*Conditional=*/true);
MasterOpGen.setAction(Action);
emitInlinedDirective(CGF, OMPD_master, MasterOpGen);
Action.Done(CGF);
}
void CGOpenMPRuntime::emitTaskyieldCall(CodeGenFunction &CGF,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Build call __kmpc_omp_taskyield(loc, thread_id, 0);
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
llvm::ConstantInt::get(CGM.IntTy, /*V=*/0, /*isSigned=*/true)};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_taskyield), Args);
if (auto *Region = dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
Region->emitUntiedSwitch(CGF);
}
void CGOpenMPRuntime::emitTaskgroupRegion(CodeGenFunction &CGF,
const RegionCodeGenTy &TaskgroupOpGen,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// __kmpc_taskgroup(ident_t *, gtid);
// TaskgroupOpGen();
// __kmpc_end_taskgroup(ident_t *, gtid);
// Prepare arguments and build a call to __kmpc_taskgroup
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_taskgroup), Args,
createRuntimeFunction(OMPRTL__kmpc_end_taskgroup),
Args);
TaskgroupOpGen.setAction(Action);
emitInlinedDirective(CGF, OMPD_taskgroup, TaskgroupOpGen);
}
/// Given an array of pointers to variables, project the address of a
/// given variable.
static Address emitAddrOfVarFromArray(CodeGenFunction &CGF, Address Array,
unsigned Index, const VarDecl *Var) {
// Pull out the pointer to the variable.
Address PtrAddr =
CGF.Builder.CreateConstArrayGEP(Array, Index, CGF.getPointerSize());
llvm::Value *Ptr = CGF.Builder.CreateLoad(PtrAddr);
Address Addr = Address(Ptr, CGF.getContext().getDeclAlign(Var));
Addr = CGF.Builder.CreateElementBitCast(
Addr, CGF.ConvertTypeForMem(Var->getType()));
return Addr;
}
static llvm::Value *emitCopyprivateCopyFunction(
CodeGenModule &CGM, llvm::Type *ArgsType,
ArrayRef<const Expr *> CopyprivateVars, ArrayRef<const Expr *> DestExprs,
ArrayRef<const Expr *> SrcExprs, ArrayRef<const Expr *> AssignmentOps) {
auto &C = CGM.getContext();
// void copy_func(void *LHSArg, void *RHSArg);
FunctionArgList Args;
ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
C.VoidPtrTy);
ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
C.VoidPtrTy);
Args.push_back(&LHSArg);
Args.push_back(&RHSArg);
auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
".omp.copyprivate.copy_func", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
// Dest = (void*[n])(LHSArg);
// Src = (void*[n])(RHSArg);
Address LHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)),
ArgsType), CGF.getPointerAlign());
Address RHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)),
ArgsType), CGF.getPointerAlign());
// *(Type0*)Dst[0] = *(Type0*)Src[0];
// *(Type1*)Dst[1] = *(Type1*)Src[1];
// ...
// *(Typen*)Dst[n] = *(Typen*)Src[n];
for (unsigned I = 0, E = AssignmentOps.size(); I < E; ++I) {
auto DestVar = cast<VarDecl>(cast<DeclRefExpr>(DestExprs[I])->getDecl());
Address DestAddr = emitAddrOfVarFromArray(CGF, LHS, I, DestVar);
auto SrcVar = cast<VarDecl>(cast<DeclRefExpr>(SrcExprs[I])->getDecl());
Address SrcAddr = emitAddrOfVarFromArray(CGF, RHS, I, SrcVar);
auto *VD = cast<DeclRefExpr>(CopyprivateVars[I])->getDecl();
QualType Type = VD->getType();
CGF.EmitOMPCopy(Type, DestAddr, SrcAddr, DestVar, SrcVar, AssignmentOps[I]);
}
CGF.FinishFunction();
return Fn;
}
void CGOpenMPRuntime::emitSingleRegion(CodeGenFunction &CGF,
const RegionCodeGenTy &SingleOpGen,
SourceLocation Loc,
ArrayRef<const Expr *> CopyprivateVars,
ArrayRef<const Expr *> SrcExprs,
ArrayRef<const Expr *> DstExprs,
ArrayRef<const Expr *> AssignmentOps) {
if (!CGF.HaveInsertPoint())
return;
assert(CopyprivateVars.size() == SrcExprs.size() &&
CopyprivateVars.size() == DstExprs.size() &&
CopyprivateVars.size() == AssignmentOps.size());
auto &C = CGM.getContext();
// int32 did_it = 0;
// if(__kmpc_single(ident_t *, gtid)) {
// SingleOpGen();
// __kmpc_end_single(ident_t *, gtid);
// did_it = 1;
// }
// call __kmpc_copyprivate(ident_t *, gtid, <buf_size>, <copyprivate list>,
// <copy_func>, did_it);
Address DidIt = Address::invalid();
if (!CopyprivateVars.empty()) {
// int32 did_it = 0;
auto KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1);
DidIt = CGF.CreateMemTemp(KmpInt32Ty, ".omp.copyprivate.did_it");
CGF.Builder.CreateStore(CGF.Builder.getInt32(0), DidIt);
}
// Prepare arguments and build a call to __kmpc_single
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_single), Args,
createRuntimeFunction(OMPRTL__kmpc_end_single), Args,
/*Conditional=*/true);
SingleOpGen.setAction(Action);
emitInlinedDirective(CGF, OMPD_single, SingleOpGen);
if (DidIt.isValid()) {
// did_it = 1;
CGF.Builder.CreateStore(CGF.Builder.getInt32(1), DidIt);
}
Action.Done(CGF);
// call __kmpc_copyprivate(ident_t *, gtid, <buf_size>, <copyprivate list>,
// <copy_func>, did_it);
if (DidIt.isValid()) {
llvm::APInt ArraySize(/*unsigned int numBits=*/32, CopyprivateVars.size());
auto CopyprivateArrayTy =
C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
/*IndexTypeQuals=*/0);
// Create a list of all private variables for copyprivate.
Address CopyprivateList =
CGF.CreateMemTemp(CopyprivateArrayTy, ".omp.copyprivate.cpr_list");
for (unsigned I = 0, E = CopyprivateVars.size(); I < E; ++I) {
Address Elem = CGF.Builder.CreateConstArrayGEP(
CopyprivateList, I, CGF.getPointerSize());
CGF.Builder.CreateStore(
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLValue(CopyprivateVars[I]).getPointer(), CGF.VoidPtrTy),
Elem);
}
// Build function that copies private values from single region to all other
// threads in the corresponding parallel region.
auto *CpyFn = emitCopyprivateCopyFunction(
CGM, CGF.ConvertTypeForMem(CopyprivateArrayTy)->getPointerTo(),
CopyprivateVars, SrcExprs, DstExprs, AssignmentOps);
auto *BufSize = CGF.getTypeSize(CopyprivateArrayTy);
Address CL =
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(CopyprivateList,
CGF.VoidPtrTy);
auto *DidItVal = CGF.Builder.CreateLoad(DidIt);
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), // ident_t *<loc>
getThreadID(CGF, Loc), // i32 <gtid>
BufSize, // size_t <buf_size>
CL.getPointer(), // void *<copyprivate list>
CpyFn, // void (*) (void *, void *) <copy_func>
DidItVal // i32 did_it
};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_copyprivate), Args);
}
}
void CGOpenMPRuntime::emitOrderedRegion(CodeGenFunction &CGF,
const RegionCodeGenTy &OrderedOpGen,
SourceLocation Loc, bool IsThreads) {
if (!CGF.HaveInsertPoint())
return;
// __kmpc_ordered(ident_t *, gtid);
// OrderedOpGen();
// __kmpc_end_ordered(ident_t *, gtid);
// Prepare arguments and build a call to __kmpc_ordered
if (IsThreads) {
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_ordered), Args,
createRuntimeFunction(OMPRTL__kmpc_end_ordered),
Args);
OrderedOpGen.setAction(Action);
emitInlinedDirective(CGF, OMPD_ordered, OrderedOpGen);
return;
}
emitInlinedDirective(CGF, OMPD_ordered, OrderedOpGen);
}
void CGOpenMPRuntime::emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc,
OpenMPDirectiveKind Kind, bool EmitChecks,
bool ForceSimpleCall) {
if (!CGF.HaveInsertPoint())
return;
// Build call __kmpc_cancel_barrier(loc, thread_id);
// Build call __kmpc_barrier(loc, thread_id);
unsigned Flags;
if (Kind == OMPD_for)
Flags = OMP_IDENT_BARRIER_IMPL_FOR;
else if (Kind == OMPD_sections)
Flags = OMP_IDENT_BARRIER_IMPL_SECTIONS;
else if (Kind == OMPD_single)
Flags = OMP_IDENT_BARRIER_IMPL_SINGLE;
else if (Kind == OMPD_barrier)
Flags = OMP_IDENT_BARRIER_EXPL;
else
Flags = OMP_IDENT_BARRIER_IMPL;
// Build call __kmpc_cancel_barrier(loc, thread_id) or __kmpc_barrier(loc,
// thread_id);
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
getThreadID(CGF, Loc)};
if (auto *OMPRegionInfo =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
if (!ForceSimpleCall && OMPRegionInfo->hasCancel()) {
auto *Result = CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_cancel_barrier), Args);
if (EmitChecks) {
// if (__kmpc_cancel_barrier()) {
// exit from construct;
// }
auto *ExitBB = CGF.createBasicBlock(".cancel.exit");
auto *ContBB = CGF.createBasicBlock(".cancel.continue");
auto *Cmp = CGF.Builder.CreateIsNotNull(Result);
CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB);
CGF.EmitBlock(ExitBB);
// exit from construct;
auto CancelDestination =
CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind());
CGF.EmitBranchThroughCleanup(CancelDestination);
CGF.EmitBlock(ContBB, /*IsFinished=*/true);
}
return;
}
}
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_barrier), Args);
}
/// \brief Map the OpenMP loop schedule to the runtime enumeration.
static OpenMPSchedType getRuntimeSchedule(OpenMPScheduleClauseKind ScheduleKind,
bool Chunked, bool Ordered) {
switch (ScheduleKind) {
case OMPC_SCHEDULE_static:
return Chunked ? (Ordered ? OMP_ord_static_chunked : OMP_sch_static_chunked)
: (Ordered ? OMP_ord_static : OMP_sch_static);
case OMPC_SCHEDULE_dynamic:
return Ordered ? OMP_ord_dynamic_chunked : OMP_sch_dynamic_chunked;
case OMPC_SCHEDULE_guided:
return Ordered ? OMP_ord_guided_chunked : OMP_sch_guided_chunked;
case OMPC_SCHEDULE_runtime:
return Ordered ? OMP_ord_runtime : OMP_sch_runtime;
case OMPC_SCHEDULE_auto:
return Ordered ? OMP_ord_auto : OMP_sch_auto;
case OMPC_SCHEDULE_unknown:
assert(!Chunked && "chunk was specified but schedule kind not known");
return Ordered ? OMP_ord_static : OMP_sch_static;
}
llvm_unreachable("Unexpected runtime schedule");
}
/// \brief Map the OpenMP distribute schedule to the runtime enumeration.
static OpenMPSchedType
getRuntimeSchedule(OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) {
// only static is allowed for dist_schedule
return Chunked ? OMP_dist_sch_static_chunked : OMP_dist_sch_static;
}
bool CGOpenMPRuntime::isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind,
bool Chunked) const {
auto Schedule = getRuntimeSchedule(ScheduleKind, Chunked, /*Ordered=*/false);
return Schedule == OMP_sch_static;
}
bool CGOpenMPRuntime::isStaticNonchunked(
OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) const {
auto Schedule = getRuntimeSchedule(ScheduleKind, Chunked);
return Schedule == OMP_dist_sch_static;
}
bool CGOpenMPRuntime::isDynamic(OpenMPScheduleClauseKind ScheduleKind) const {
auto Schedule =
getRuntimeSchedule(ScheduleKind, /*Chunked=*/false, /*Ordered=*/false);
assert(Schedule != OMP_sch_static_chunked && "cannot be chunked here");
return Schedule != OMP_sch_static;
}
static int addMonoNonMonoModifier(OpenMPSchedType Schedule,
OpenMPScheduleClauseModifier M1,
OpenMPScheduleClauseModifier M2) {
int Modifier = 0;
switch (M1) {
case OMPC_SCHEDULE_MODIFIER_monotonic:
Modifier = OMP_sch_modifier_monotonic;
break;
case OMPC_SCHEDULE_MODIFIER_nonmonotonic:
Modifier = OMP_sch_modifier_nonmonotonic;
break;
case OMPC_SCHEDULE_MODIFIER_simd:
if (Schedule == OMP_sch_static_chunked)
Schedule = OMP_sch_static_balanced_chunked;
break;
case OMPC_SCHEDULE_MODIFIER_last:
case OMPC_SCHEDULE_MODIFIER_unknown:
break;
}
switch (M2) {
case OMPC_SCHEDULE_MODIFIER_monotonic:
Modifier = OMP_sch_modifier_monotonic;
break;
case OMPC_SCHEDULE_MODIFIER_nonmonotonic:
Modifier = OMP_sch_modifier_nonmonotonic;
break;
case OMPC_SCHEDULE_MODIFIER_simd:
if (Schedule == OMP_sch_static_chunked)
Schedule = OMP_sch_static_balanced_chunked;
break;
case OMPC_SCHEDULE_MODIFIER_last:
case OMPC_SCHEDULE_MODIFIER_unknown:
break;
}
return Schedule | Modifier;
}
void CGOpenMPRuntime::emitForDispatchInit(CodeGenFunction &CGF,
SourceLocation Loc,
const OpenMPScheduleTy &ScheduleKind,
unsigned IVSize, bool IVSigned,
bool Ordered, llvm::Value *UB,
llvm::Value *Chunk) {
if (!CGF.HaveInsertPoint())
return;
OpenMPSchedType Schedule =
getRuntimeSchedule(ScheduleKind.Schedule, Chunk != nullptr, Ordered);
assert(Ordered ||
(Schedule != OMP_sch_static && Schedule != OMP_sch_static_chunked &&
Schedule != OMP_ord_static && Schedule != OMP_ord_static_chunked &&
Schedule != OMP_sch_static_balanced_chunked));
// Call __kmpc_dispatch_init(
// ident_t *loc, kmp_int32 tid, kmp_int32 schedule,
// kmp_int[32|64] lower, kmp_int[32|64] upper,
// kmp_int[32|64] stride, kmp_int[32|64] chunk);
// If the Chunk was not specified in the clause - use default value 1.
if (Chunk == nullptr)
Chunk = CGF.Builder.getIntN(IVSize, 1);
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
CGF.Builder.getInt32(addMonoNonMonoModifier(
Schedule, ScheduleKind.M1, ScheduleKind.M2)), // Schedule type
CGF.Builder.getIntN(IVSize, 0), // Lower
UB, // Upper
CGF.Builder.getIntN(IVSize, 1), // Stride
Chunk // Chunk
};
CGF.EmitRuntimeCall(createDispatchInitFunction(IVSize, IVSigned), Args);
}
static void emitForStaticInitCall(
CodeGenFunction &CGF, llvm::Value *UpdateLocation, llvm::Value *ThreadId,
llvm::Constant *ForStaticInitFunction, OpenMPSchedType Schedule,
OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2,
unsigned IVSize, bool Ordered, Address IL, Address LB, Address UB,
Address ST, llvm::Value *Chunk) {
if (!CGF.HaveInsertPoint())
return;
assert(!Ordered);
assert(Schedule == OMP_sch_static || Schedule == OMP_sch_static_chunked ||
Schedule == OMP_sch_static_balanced_chunked ||
Schedule == OMP_ord_static || Schedule == OMP_ord_static_chunked ||
Schedule == OMP_dist_sch_static ||
Schedule == OMP_dist_sch_static_chunked);
// Call __kmpc_for_static_init(
// ident_t *loc, kmp_int32 tid, kmp_int32 schedtype,
// kmp_int32 *p_lastiter, kmp_int[32|64] *p_lower,
// kmp_int[32|64] *p_upper, kmp_int[32|64] *p_stride,
// kmp_int[32|64] incr, kmp_int[32|64] chunk);
if (Chunk == nullptr) {
assert((Schedule == OMP_sch_static || Schedule == OMP_ord_static ||
Schedule == OMP_dist_sch_static) &&
"expected static non-chunked schedule");
// If the Chunk was not specified in the clause - use default value 1.
Chunk = CGF.Builder.getIntN(IVSize, 1);
} else {
assert((Schedule == OMP_sch_static_chunked ||
Schedule == OMP_sch_static_balanced_chunked ||
Schedule == OMP_ord_static_chunked ||
Schedule == OMP_dist_sch_static_chunked) &&
"expected static chunked schedule");
}
llvm::Value *Args[] = {
UpdateLocation, ThreadId, CGF.Builder.getInt32(addMonoNonMonoModifier(
Schedule, M1, M2)), // Schedule type
IL.getPointer(), // &isLastIter
LB.getPointer(), // &LB
UB.getPointer(), // &UB
ST.getPointer(), // &Stride
CGF.Builder.getIntN(IVSize, 1), // Incr
Chunk // Chunk
};
CGF.EmitRuntimeCall(ForStaticInitFunction, Args);
}
void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF,
SourceLocation Loc,
const OpenMPScheduleTy &ScheduleKind,
unsigned IVSize, bool IVSigned,
bool Ordered, Address IL, Address LB,
Address UB, Address ST,
llvm::Value *Chunk) {
OpenMPSchedType ScheduleNum =
getRuntimeSchedule(ScheduleKind.Schedule, Chunk != nullptr, Ordered);
auto *UpdatedLocation = emitUpdateLocation(CGF, Loc);
auto *ThreadId = getThreadID(CGF, Loc);
auto *StaticInitFunction = createForStaticInitFunction(IVSize, IVSigned);
emitForStaticInitCall(CGF, UpdatedLocation, ThreadId, StaticInitFunction,
ScheduleNum, ScheduleKind.M1, ScheduleKind.M2, IVSize,
Ordered, IL, LB, UB, ST, Chunk);
}
void CGOpenMPRuntime::emitDistributeStaticInit(
CodeGenFunction &CGF, SourceLocation Loc,
OpenMPDistScheduleClauseKind SchedKind, unsigned IVSize, bool IVSigned,
bool Ordered, Address IL, Address LB, Address UB, Address ST,
llvm::Value *Chunk) {
OpenMPSchedType ScheduleNum = getRuntimeSchedule(SchedKind, Chunk != nullptr);
auto *UpdatedLocation = emitUpdateLocation(CGF, Loc);
auto *ThreadId = getThreadID(CGF, Loc);
auto *StaticInitFunction = createForStaticInitFunction(IVSize, IVSigned);
emitForStaticInitCall(CGF, UpdatedLocation, ThreadId, StaticInitFunction,
ScheduleNum, OMPC_SCHEDULE_MODIFIER_unknown,
OMPC_SCHEDULE_MODIFIER_unknown, IVSize, Ordered, IL, LB,
UB, ST, Chunk);
}
void CGOpenMPRuntime::emitForStaticFinish(CodeGenFunction &CGF,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Call __kmpc_for_static_fini(ident_t *loc, kmp_int32 tid);
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_for_static_fini),
Args);
}
void CGOpenMPRuntime::emitForOrderedIterationEnd(CodeGenFunction &CGF,
SourceLocation Loc,
unsigned IVSize,
bool IVSigned) {
if (!CGF.HaveInsertPoint())
return;
// Call __kmpc_for_dynamic_fini_(4|8)[u](ident_t *loc, kmp_int32 tid);
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CGF.EmitRuntimeCall(createDispatchFiniFunction(IVSize, IVSigned), Args);
}
llvm::Value *CGOpenMPRuntime::emitForNext(CodeGenFunction &CGF,
SourceLocation Loc, unsigned IVSize,
bool IVSigned, Address IL,
Address LB, Address UB,
Address ST) {
// Call __kmpc_dispatch_next(
// ident_t *loc, kmp_int32 tid, kmp_int32 *p_lastiter,
// kmp_int[32|64] *p_lower, kmp_int[32|64] *p_upper,
// kmp_int[32|64] *p_stride);
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc),
getThreadID(CGF, Loc),
IL.getPointer(), // &isLastIter
LB.getPointer(), // &Lower
UB.getPointer(), // &Upper
ST.getPointer() // &Stride
};
llvm::Value *Call =
CGF.EmitRuntimeCall(createDispatchNextFunction(IVSize, IVSigned), Args);
return CGF.EmitScalarConversion(
Call, CGF.getContext().getIntTypeForBitwidth(32, /* Signed */ true),
CGF.getContext().BoolTy, Loc);
}
void CGOpenMPRuntime::emitNumThreadsClause(CodeGenFunction &CGF,
llvm::Value *NumThreads,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Build call __kmpc_push_num_threads(&loc, global_tid, num_threads)
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
CGF.Builder.CreateIntCast(NumThreads, CGF.Int32Ty, /*isSigned*/ true)};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_threads),
Args);
}
void CGOpenMPRuntime::emitProcBindClause(CodeGenFunction &CGF,
OpenMPProcBindClauseKind ProcBind,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Constants for proc bind value accepted by the runtime.
enum ProcBindTy {
ProcBindFalse = 0,
ProcBindTrue,
ProcBindMaster,
ProcBindClose,
ProcBindSpread,
ProcBindIntel,
ProcBindDefault
} RuntimeProcBind;
switch (ProcBind) {
case OMPC_PROC_BIND_master:
RuntimeProcBind = ProcBindMaster;
break;
case OMPC_PROC_BIND_close:
RuntimeProcBind = ProcBindClose;
break;
case OMPC_PROC_BIND_spread:
RuntimeProcBind = ProcBindSpread;
break;
case OMPC_PROC_BIND_unknown:
llvm_unreachable("Unsupported proc_bind value.");
}
// Build call __kmpc_push_proc_bind(&loc, global_tid, proc_bind)
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
llvm::ConstantInt::get(CGM.IntTy, RuntimeProcBind, /*isSigned=*/true)};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_proc_bind), Args);
}
void CGOpenMPRuntime::emitFlush(CodeGenFunction &CGF, ArrayRef<const Expr *>,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Build call void __kmpc_flush(ident_t *loc)
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_flush),
emitUpdateLocation(CGF, Loc));
}
namespace {
/// \brief Indexes of fields for type kmp_task_t.
enum KmpTaskTFields {
/// \brief List of shared variables.
KmpTaskTShareds,
/// \brief Task routine.
KmpTaskTRoutine,
/// \brief Partition id for the untied tasks.
KmpTaskTPartId,
/// Function with call of destructors for private variables.
Data1,
/// Task priority.
Data2,
/// (Taskloops only) Lower bound.
KmpTaskTLowerBound,
/// (Taskloops only) Upper bound.
KmpTaskTUpperBound,
/// (Taskloops only) Stride.
KmpTaskTStride,
/// (Taskloops only) Is last iteration flag.
KmpTaskTLastIter,
};
} // anonymous namespace
bool CGOpenMPRuntime::OffloadEntriesInfoManagerTy::empty() const {
// FIXME: Add other entries type when they become supported.
return OffloadEntriesTargetRegion.empty();
}
/// \brief Initialize target region entry.
void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::
initializeTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID,
StringRef ParentName, unsigned LineNum,
unsigned Order) {
assert(CGM.getLangOpts().OpenMPIsDevice && "Initialization of entries is "
"only required for the device "
"code generation.");
OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] =
OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
/*Flags=*/0);
++OffloadingEntriesNum;
}
void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::
registerTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID,
StringRef ParentName, unsigned LineNum,
llvm::Constant *Addr, llvm::Constant *ID,
int32_t Flags) {
// If we are emitting code for a target, the entry is already initialized,
// only has to be registered.
if (CGM.getLangOpts().OpenMPIsDevice) {
assert(hasTargetRegionEntryInfo(DeviceID, FileID, ParentName, LineNum) &&
"Entry must exist.");
auto &Entry =
OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum];
assert(Entry.isValid() && "Entry not initialized!");
Entry.setAddress(Addr);
Entry.setID(ID);
Entry.setFlags(Flags);
return;
} else {
OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum++, Addr, ID, Flags);
OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] = Entry;
}
}
bool CGOpenMPRuntime::OffloadEntriesInfoManagerTy::hasTargetRegionEntryInfo(
unsigned DeviceID, unsigned FileID, StringRef ParentName,
unsigned LineNum) const {
auto PerDevice = OffloadEntriesTargetRegion.find(DeviceID);
if (PerDevice == OffloadEntriesTargetRegion.end())
return false;
auto PerFile = PerDevice->second.find(FileID);
if (PerFile == PerDevice->second.end())
return false;
auto PerParentName = PerFile->second.find(ParentName);
if (PerParentName == PerFile->second.end())
return false;
auto PerLine = PerParentName->second.find(LineNum);
if (PerLine == PerParentName->second.end())
return false;
// Fail if this entry is already registered.
if (PerLine->second.getAddress() || PerLine->second.getID())
return false;
return true;
}
void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::actOnTargetRegionEntriesInfo(
const OffloadTargetRegionEntryInfoActTy &Action) {
// Scan all target region entries and perform the provided action.
for (auto &D : OffloadEntriesTargetRegion)
for (auto &F : D.second)
for (auto &P : F.second)
for (auto &L : P.second)
Action(D.first, F.first, P.first(), L.first, L.second);
}
/// \brief Create a Ctor/Dtor-like function whose body is emitted through
/// \a Codegen. This is used to emit the two functions that register and
/// unregister the descriptor of the current compilation unit.
static llvm::Function *
createOffloadingBinaryDescriptorFunction(CodeGenModule &CGM, StringRef Name,
const RegionCodeGenTy &Codegen) {
auto &C = CGM.getContext();
FunctionArgList Args;
ImplicitParamDecl DummyPtr(C, /*DC=*/nullptr, SourceLocation(),
/*Id=*/nullptr, C.VoidPtrTy);
Args.push_back(&DummyPtr);
CodeGenFunction CGF(CGM);
auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto FTy = CGM.getTypes().GetFunctionType(FI);
auto *Fn =
CGM.CreateGlobalInitOrDestructFunction(FTy, Name, FI, SourceLocation());
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FI, Args, SourceLocation());
Codegen(CGF);
CGF.FinishFunction();
return Fn;
}
llvm::Function *
CGOpenMPRuntime::createOffloadingBinaryDescriptorRegistration() {
// If we don't have entries or if we are emitting code for the device, we
// don't need to do anything.
if (CGM.getLangOpts().OpenMPIsDevice || OffloadEntriesInfoManager.empty())
return nullptr;
auto &M = CGM.getModule();
auto &C = CGM.getContext();
// Get list of devices we care about
auto &Devices = CGM.getLangOpts().OMPTargetTriples;
// We should be creating an offloading descriptor only if there are devices
// specified.
assert(!Devices.empty() && "No OpenMP offloading devices??");
// Create the external variables that will point to the begin and end of the
// host entries section. These will be defined by the linker.
auto *OffloadEntryTy =
CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy());
llvm::GlobalVariable *HostEntriesBegin = new llvm::GlobalVariable(
M, OffloadEntryTy, /*isConstant=*/true,
llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
".omp_offloading.entries_begin");
llvm::GlobalVariable *HostEntriesEnd = new llvm::GlobalVariable(
M, OffloadEntryTy, /*isConstant=*/true,
llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
".omp_offloading.entries_end");
// Create all device images
auto *DeviceImageTy = cast<llvm::StructType>(
CGM.getTypes().ConvertTypeForMem(getTgtDeviceImageQTy()));
ConstantInitBuilder DeviceImagesBuilder(CGM);
auto DeviceImagesEntries = DeviceImagesBuilder.beginArray(DeviceImageTy);
for (unsigned i = 0; i < Devices.size(); ++i) {
StringRef T = Devices[i].getTriple();
auto *ImgBegin = new llvm::GlobalVariable(
M, CGM.Int8Ty, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
/*Initializer=*/nullptr,
Twine(".omp_offloading.img_start.") + Twine(T));
auto *ImgEnd = new llvm::GlobalVariable(
M, CGM.Int8Ty, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
/*Initializer=*/nullptr, Twine(".omp_offloading.img_end.") + Twine(T));
auto Dev = DeviceImagesEntries.beginStruct(DeviceImageTy);
Dev.add(ImgBegin);
Dev.add(ImgEnd);
Dev.add(HostEntriesBegin);
Dev.add(HostEntriesEnd);
Dev.finishAndAddTo(DeviceImagesEntries);
}
// Create device images global array.
llvm::GlobalVariable *DeviceImages =
DeviceImagesEntries.finishAndCreateGlobal(".omp_offloading.device_images",
CGM.getPointerAlign(),
/*isConstant=*/true);
DeviceImages->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
// This is a Zero array to be used in the creation of the constant expressions
llvm::Constant *Index[] = {llvm::Constant::getNullValue(CGM.Int32Ty),
llvm::Constant::getNullValue(CGM.Int32Ty)};
// Create the target region descriptor.
auto *BinaryDescriptorTy = cast<llvm::StructType>(
CGM.getTypes().ConvertTypeForMem(getTgtBinaryDescriptorQTy()));
ConstantInitBuilder DescBuilder(CGM);
auto DescInit = DescBuilder.beginStruct(BinaryDescriptorTy);
DescInit.addInt(CGM.Int32Ty, Devices.size());
DescInit.add(llvm::ConstantExpr::getGetElementPtr(DeviceImages->getValueType(),
DeviceImages,
Index));
DescInit.add(HostEntriesBegin);
DescInit.add(HostEntriesEnd);
auto *Desc = DescInit.finishAndCreateGlobal(".omp_offloading.descriptor",
CGM.getPointerAlign(),
/*isConstant=*/true);
// Emit code to register or unregister the descriptor at execution
// startup or closing, respectively.
// Create a variable to drive the registration and unregistration of the
// descriptor, so we can reuse the logic that emits Ctors and Dtors.
auto *IdentInfo = &C.Idents.get(".omp_offloading.reg_unreg_var");
ImplicitParamDecl RegUnregVar(C, C.getTranslationUnitDecl(), SourceLocation(),
IdentInfo, C.CharTy);
auto *UnRegFn = createOffloadingBinaryDescriptorFunction(
CGM, ".omp_offloading.descriptor_unreg",
[&](CodeGenFunction &CGF, PrePostActionTy &) {
CGF.EmitCallOrInvoke(createRuntimeFunction(OMPRTL__tgt_unregister_lib),
Desc);
});
auto *RegFn = createOffloadingBinaryDescriptorFunction(
CGM, ".omp_offloading.descriptor_reg",
[&](CodeGenFunction &CGF, PrePostActionTy &) {
CGF.EmitCallOrInvoke(createRuntimeFunction(OMPRTL__tgt_register_lib),
Desc);
CGM.getCXXABI().registerGlobalDtor(CGF, RegUnregVar, UnRegFn, Desc);
});
return RegFn;
}
void CGOpenMPRuntime::createOffloadEntry(llvm::Constant *ID,
llvm::Constant *Addr, uint64_t Size,
int32_t Flags) {
StringRef Name = Addr->getName();
auto *TgtOffloadEntryType = cast<llvm::StructType>(
CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy()));
llvm::LLVMContext &C = CGM.getModule().getContext();
llvm::Module &M = CGM.getModule();
// Make sure the address has the right type.
llvm::Constant *AddrPtr = llvm::ConstantExpr::getBitCast(ID, CGM.VoidPtrTy);
// Create constant string with the name.
llvm::Constant *StrPtrInit = llvm::ConstantDataArray::getString(C, Name);
llvm::GlobalVariable *Str =
new llvm::GlobalVariable(M, StrPtrInit->getType(), /*isConstant=*/true,
llvm::GlobalValue::InternalLinkage, StrPtrInit,
".omp_offloading.entry_name");
Str->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
llvm::Constant *StrPtr = llvm::ConstantExpr::getBitCast(Str, CGM.Int8PtrTy);
// We can't have any padding between symbols, so we need to have 1-byte
// alignment.
auto Align = CharUnits::fromQuantity(1);
// Create the entry struct.
ConstantInitBuilder EntryBuilder(CGM);
auto EntryInit = EntryBuilder.beginStruct(TgtOffloadEntryType);
EntryInit.add(AddrPtr);
EntryInit.add(StrPtr);
EntryInit.addInt(CGM.SizeTy, Size);
EntryInit.addInt(CGM.Int32Ty, Flags);
EntryInit.addInt(CGM.Int32Ty, 0);
llvm::GlobalVariable *Entry =
EntryInit.finishAndCreateGlobal(".omp_offloading.entry",
Align,
/*constant*/ true,
llvm::GlobalValue::ExternalLinkage);
// The entry has to be created in the section the linker expects it to be.
Entry->setSection(".omp_offloading.entries");
}
void CGOpenMPRuntime::createOffloadEntriesAndInfoMetadata() {
// Emit the offloading entries and metadata so that the device codegen side
// can easily figure out what to emit. The produced metadata looks like
// this:
//
// !omp_offload.info = !{!1, ...}
//
// Right now we only generate metadata for function that contain target
// regions.
// If we do not have entries, we dont need to do anything.
if (OffloadEntriesInfoManager.empty())
return;
llvm::Module &M = CGM.getModule();
llvm::LLVMContext &C = M.getContext();
SmallVector<OffloadEntriesInfoManagerTy::OffloadEntryInfo *, 16>
OrderedEntries(OffloadEntriesInfoManager.size());
// Create the offloading info metadata node.
llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
// Auxiliar methods to create metadata values and strings.
auto getMDInt = [&](unsigned v) {
return llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(C), v));
};
auto getMDString = [&](StringRef v) { return llvm::MDString::get(C, v); };
// Create function that emits metadata for each target region entry;
auto &&TargetRegionMetadataEmitter = [&](
unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned Line,
OffloadEntriesInfoManagerTy::OffloadEntryInfoTargetRegion &E) {
llvm::SmallVector<llvm::Metadata *, 32> Ops;
// Generate metadata for target regions. Each entry of this metadata
// contains:
// - Entry 0 -> Kind of this type of metadata (0).
// - Entry 1 -> Device ID of the file where the entry was identified.
// - Entry 2 -> File ID of the file where the entry was identified.
// - Entry 3 -> Mangled name of the function where the entry was identified.
// - Entry 4 -> Line in the file where the entry was identified.
// - Entry 5 -> Order the entry was created.
// The first element of the metadata node is the kind.
Ops.push_back(getMDInt(E.getKind()));
Ops.push_back(getMDInt(DeviceID));
Ops.push_back(getMDInt(FileID));
Ops.push_back(getMDString(ParentName));
Ops.push_back(getMDInt(Line));
Ops.push_back(getMDInt(E.getOrder()));
// Save this entry in the right position of the ordered entries array.
OrderedEntries[E.getOrder()] = &E;
// Add metadata to the named metadata node.
MD->addOperand(llvm::MDNode::get(C, Ops));
};
OffloadEntriesInfoManager.actOnTargetRegionEntriesInfo(
TargetRegionMetadataEmitter);
for (auto *E : OrderedEntries) {
assert(E && "All ordered entries must exist!");
if (auto *CE =
dyn_cast<OffloadEntriesInfoManagerTy::OffloadEntryInfoTargetRegion>(
E)) {
assert(CE->getID() && CE->getAddress() &&
"Entry ID and Addr are invalid!");
createOffloadEntry(CE->getID(), CE->getAddress(), /*Size=*/0);
} else
llvm_unreachable("Unsupported entry kind.");
}
}
/// \brief Loads all the offload entries information from the host IR
/// metadata.
void CGOpenMPRuntime::loadOffloadInfoMetadata() {
// If we are in target mode, load the metadata from the host IR. This code has
// to match the metadaata creation in createOffloadEntriesAndInfoMetadata().
if (!CGM.getLangOpts().OpenMPIsDevice)
return;
if (CGM.getLangOpts().OMPHostIRFile.empty())
return;
auto Buf = llvm::MemoryBuffer::getFile(CGM.getLangOpts().OMPHostIRFile);
if (Buf.getError())
return;
llvm::LLVMContext C;
auto ME = expectedToErrorOrAndEmitErrors(
C, llvm::parseBitcodeFile(Buf.get()->getMemBufferRef(), C));
if (ME.getError())
return;
llvm::NamedMDNode *MD = ME.get()->getNamedMetadata("omp_offload.info");
if (!MD)
return;
for (auto I : MD->operands()) {
llvm::MDNode *MN = cast<llvm::MDNode>(I);
auto getMDInt = [&](unsigned Idx) {
llvm::ConstantAsMetadata *V =
cast<llvm::ConstantAsMetadata>(MN->getOperand(Idx));
return cast<llvm::ConstantInt>(V->getValue())->getZExtValue();
};
auto getMDString = [&](unsigned Idx) {
llvm::MDString *V = cast<llvm::MDString>(MN->getOperand(Idx));
return V->getString();
};
switch (getMDInt(0)) {
default:
llvm_unreachable("Unexpected metadata!");
break;
case OffloadEntriesInfoManagerTy::OffloadEntryInfo::
OFFLOAD_ENTRY_INFO_TARGET_REGION:
OffloadEntriesInfoManager.initializeTargetRegionEntryInfo(
/*DeviceID=*/getMDInt(1), /*FileID=*/getMDInt(2),
/*ParentName=*/getMDString(3), /*Line=*/getMDInt(4),
/*Order=*/getMDInt(5));
break;
}
}
}
void CGOpenMPRuntime::emitKmpRoutineEntryT(QualType KmpInt32Ty) {
if (!KmpRoutineEntryPtrTy) {
// Build typedef kmp_int32 (* kmp_routine_entry_t)(kmp_int32, void *); type.
auto &C = CGM.getContext();
QualType KmpRoutineEntryTyArgs[] = {KmpInt32Ty, C.VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
KmpRoutineEntryPtrQTy = C.getPointerType(
C.getFunctionType(KmpInt32Ty, KmpRoutineEntryTyArgs, EPI));
KmpRoutineEntryPtrTy = CGM.getTypes().ConvertType(KmpRoutineEntryPtrQTy);
}
}
static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
QualType FieldTy) {
auto *Field = FieldDecl::Create(
C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
DC->addDecl(Field);
return Field;
}
QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() {
// Make sure the type of the entry is already created. This is the type we
// have to create:
// struct __tgt_offload_entry{
// void *addr; // Pointer to the offload entry info.
// // (function or global)
// char *name; // Name of the function or global.
// size_t size; // Size of the entry info (0 if it a function).
// int32_t flags; // Flags associated with the entry, e.g. 'link'.
// int32_t reserved; // Reserved, to use by the runtime library.
// };
if (TgtOffloadEntryQTy.isNull()) {
ASTContext &C = CGM.getContext();
auto *RD = C.buildImplicitRecord("__tgt_offload_entry");
RD->startDefinition();
addFieldToRecordDecl(C, RD, C.VoidPtrTy);
addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
addFieldToRecordDecl(C, RD, C.getSizeType());
addFieldToRecordDecl(
C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
addFieldToRecordDecl(
C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
RD->completeDefinition();
TgtOffloadEntryQTy = C.getRecordType(RD);
}
return TgtOffloadEntryQTy;
}
QualType CGOpenMPRuntime::getTgtDeviceImageQTy() {
// These are the types we need to build:
// struct __tgt_device_image{
// void *ImageStart; // Pointer to the target code start.
// void *ImageEnd; // Pointer to the target code end.
// // We also add the host entries to the device image, as it may be useful
// // for the target runtime to have access to that information.
// __tgt_offload_entry *EntriesBegin; // Begin of the table with all
// // the entries.
// __tgt_offload_entry *EntriesEnd; // End of the table with all the
// // entries (non inclusive).
// };
if (TgtDeviceImageQTy.isNull()) {
ASTContext &C = CGM.getContext();
auto *RD = C.buildImplicitRecord("__tgt_device_image");
RD->startDefinition();
addFieldToRecordDecl(C, RD, C.VoidPtrTy);
addFieldToRecordDecl(C, RD, C.VoidPtrTy);
addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy()));
addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy()));
RD->completeDefinition();
TgtDeviceImageQTy = C.getRecordType(RD);
}
return TgtDeviceImageQTy;
}
QualType CGOpenMPRuntime::getTgtBinaryDescriptorQTy() {
// struct __tgt_bin_desc{
// int32_t NumDevices; // Number of devices supported.
// __tgt_device_image *DeviceImages; // Arrays of device images
// // (one per device).
// __tgt_offload_entry *EntriesBegin; // Begin of the table with all the
// // entries.
// __tgt_offload_entry *EntriesEnd; // End of the table with all the
// // entries (non inclusive).
// };
if (TgtBinaryDescriptorQTy.isNull()) {
ASTContext &C = CGM.getContext();
auto *RD = C.buildImplicitRecord("__tgt_bin_desc");
RD->startDefinition();
addFieldToRecordDecl(
C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
addFieldToRecordDecl(C, RD, C.getPointerType(getTgtDeviceImageQTy()));
addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy()));
addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy()));
RD->completeDefinition();
TgtBinaryDescriptorQTy = C.getRecordType(RD);
}
return TgtBinaryDescriptorQTy;
}
namespace {
struct PrivateHelpersTy {
PrivateHelpersTy(const VarDecl *Original, const VarDecl *PrivateCopy,
const VarDecl *PrivateElemInit)
: Original(Original), PrivateCopy(PrivateCopy),
PrivateElemInit(PrivateElemInit) {}
const VarDecl *Original;
const VarDecl *PrivateCopy;
const VarDecl *PrivateElemInit;
};
typedef std::pair<CharUnits /*Align*/, PrivateHelpersTy> PrivateDataTy;
} // anonymous namespace
static RecordDecl *
createPrivatesRecordDecl(CodeGenModule &CGM, ArrayRef<PrivateDataTy> Privates) {
if (!Privates.empty()) {
auto &C = CGM.getContext();
// Build struct .kmp_privates_t. {
// /* private vars */
// };
auto *RD = C.buildImplicitRecord(".kmp_privates.t");
RD->startDefinition();
for (auto &&Pair : Privates) {
auto *VD = Pair.second.Original;
auto Type = VD->getType();
Type = Type.getNonReferenceType();
auto *FD = addFieldToRecordDecl(C, RD, Type);
if (VD->hasAttrs()) {
for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
E(VD->getAttrs().end());
I != E; ++I)
FD->addAttr(*I);
}
}
RD->completeDefinition();
return RD;
}
return nullptr;
}
static RecordDecl *
createKmpTaskTRecordDecl(CodeGenModule &CGM, OpenMPDirectiveKind Kind,
QualType KmpInt32Ty,
QualType KmpRoutineEntryPointerQTy) {
auto &C = CGM.getContext();
// Build struct kmp_task_t {
// void * shareds;
// kmp_routine_entry_t routine;
// kmp_int32 part_id;
// kmp_cmplrdata_t data1;
// kmp_cmplrdata_t data2;
// For taskloops additional fields:
// kmp_uint64 lb;
// kmp_uint64 ub;
// kmp_int64 st;
// kmp_int32 liter;
// };
auto *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TTK_Union);
UD->startDefinition();
addFieldToRecordDecl(C, UD, KmpInt32Ty);
addFieldToRecordDecl(C, UD, KmpRoutineEntryPointerQTy);
UD->completeDefinition();
QualType KmpCmplrdataTy = C.getRecordType(UD);
auto *RD = C.buildImplicitRecord("kmp_task_t");
RD->startDefinition();
addFieldToRecordDecl(C, RD, C.VoidPtrTy);
addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy);
addFieldToRecordDecl(C, RD, KmpInt32Ty);
addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
if (isOpenMPTaskLoopDirective(Kind)) {
QualType KmpUInt64Ty =
CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0);
QualType KmpInt64Ty =
CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
addFieldToRecordDecl(C, RD, KmpUInt64Ty);
addFieldToRecordDecl(C, RD, KmpUInt64Ty);
addFieldToRecordDecl(C, RD, KmpInt64Ty);
addFieldToRecordDecl(C, RD, KmpInt32Ty);
}
RD->completeDefinition();
return RD;
}
static RecordDecl *
createKmpTaskTWithPrivatesRecordDecl(CodeGenModule &CGM, QualType KmpTaskTQTy,
ArrayRef<PrivateDataTy> Privates) {
auto &C = CGM.getContext();
// Build struct kmp_task_t_with_privates {
// kmp_task_t task_data;
// .kmp_privates_t. privates;
// };
auto *RD = C.buildImplicitRecord("kmp_task_t_with_privates");
RD->startDefinition();
addFieldToRecordDecl(C, RD, KmpTaskTQTy);
if (auto *PrivateRD = createPrivatesRecordDecl(CGM, Privates)) {
addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD));
}
RD->completeDefinition();
return RD;
}
/// \brief Emit a proxy function which accepts kmp_task_t as the second
/// argument.
/// \code
/// kmp_int32 .omp_task_entry.(kmp_int32 gtid, kmp_task_t *tt) {
/// TaskFunction(gtid, tt->part_id, &tt->privates, task_privates_map, tt,
/// For taskloops:
/// tt->task_data.lb, tt->task_data.ub, tt->task_data.st, tt->task_data.liter,
/// tt->shareds);
/// return 0;
/// }
/// \endcode
static llvm::Value *
emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc,
OpenMPDirectiveKind Kind, QualType KmpInt32Ty,
QualType KmpTaskTWithPrivatesPtrQTy,
QualType KmpTaskTWithPrivatesQTy, QualType KmpTaskTQTy,
QualType SharedsPtrTy, llvm::Value *TaskFunction,
llvm::Value *TaskPrivatesMap) {
auto &C = CGM.getContext();
FunctionArgList Args;
ImplicitParamDecl GtidArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpInt32Ty);
ImplicitParamDecl TaskTypeArg(C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr,
KmpTaskTWithPrivatesPtrQTy.withRestrict());
Args.push_back(&GtidArg);
Args.push_back(&TaskTypeArg);
auto &TaskEntryFnInfo =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(KmpInt32Ty, Args);
auto *TaskEntryTy = CGM.getTypes().GetFunctionType(TaskEntryFnInfo);
auto *TaskEntry =
llvm::Function::Create(TaskEntryTy, llvm::GlobalValue::InternalLinkage,
".omp_task_entry.", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskEntry, TaskEntryFnInfo);
CodeGenFunction CGF(CGM);
CGF.disableDebugInfo();
CGF.StartFunction(GlobalDecl(), KmpInt32Ty, TaskEntry, TaskEntryFnInfo, Args);
// TaskFunction(gtid, tt->task_data.part_id, &tt->privates, task_privates_map,
// tt,
// For taskloops:
// tt->task_data.lb, tt->task_data.ub, tt->task_data.st, tt->task_data.liter,
// tt->task_data.shareds);
auto *GtidParam = CGF.EmitLoadOfScalar(
CGF.GetAddrOfLocalVar(&GtidArg), /*Volatile=*/false, KmpInt32Ty, Loc);
LValue TDBase = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(&TaskTypeArg),
KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
auto *KmpTaskTWithPrivatesQTyRD =
cast<RecordDecl>(KmpTaskTWithPrivatesQTy->getAsTagDecl());
LValue Base =
CGF.EmitLValueForField(TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin());
auto *KmpTaskTQTyRD = cast<RecordDecl>(KmpTaskTQTy->getAsTagDecl());
auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId);
auto PartIdLVal = CGF.EmitLValueForField(Base, *PartIdFI);
auto *PartidParam = PartIdLVal.getPointer();
auto SharedsFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTShareds);
auto SharedsLVal = CGF.EmitLValueForField(Base, *SharedsFI);
auto *SharedsParam = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfLValue(SharedsLVal, Loc).getScalarVal(),
CGF.ConvertTypeForMem(SharedsPtrTy));
auto PrivatesFI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin(), 1);
llvm::Value *PrivatesParam;
if (PrivatesFI != KmpTaskTWithPrivatesQTyRD->field_end()) {
auto PrivatesLVal = CGF.EmitLValueForField(TDBase, *PrivatesFI);
PrivatesParam = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
PrivatesLVal.getPointer(), CGF.VoidPtrTy);
} else
PrivatesParam = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
llvm::Value *CommonArgs[] = {GtidParam, PartidParam, PrivatesParam,
TaskPrivatesMap,
CGF.Builder
.CreatePointerBitCastOrAddrSpaceCast(
TDBase.getAddress(), CGF.VoidPtrTy)
.getPointer()};
SmallVector<llvm::Value *, 16> CallArgs(std::begin(CommonArgs),
std::end(CommonArgs));
if (isOpenMPTaskLoopDirective(Kind)) {
auto LBFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound);
auto LBLVal = CGF.EmitLValueForField(Base, *LBFI);
auto *LBParam = CGF.EmitLoadOfLValue(LBLVal, Loc).getScalarVal();
auto UBFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound);
auto UBLVal = CGF.EmitLValueForField(Base, *UBFI);
auto *UBParam = CGF.EmitLoadOfLValue(UBLVal, Loc).getScalarVal();
auto StFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTStride);
auto StLVal = CGF.EmitLValueForField(Base, *StFI);
auto *StParam = CGF.EmitLoadOfLValue(StLVal, Loc).getScalarVal();
auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter);
auto LILVal = CGF.EmitLValueForField(Base, *LIFI);
auto *LIParam = CGF.EmitLoadOfLValue(LILVal, Loc).getScalarVal();
CallArgs.push_back(LBParam);
CallArgs.push_back(UBParam);
CallArgs.push_back(StParam);
CallArgs.push_back(LIParam);
}
CallArgs.push_back(SharedsParam);
CGF.EmitCallOrInvoke(TaskFunction, CallArgs);
CGF.EmitStoreThroughLValue(
RValue::get(CGF.Builder.getInt32(/*C=*/0)),
CGF.MakeAddrLValue(CGF.ReturnValue, KmpInt32Ty));
CGF.FinishFunction();
return TaskEntry;
}
static llvm::Value *emitDestructorsFunction(CodeGenModule &CGM,
SourceLocation Loc,
QualType KmpInt32Ty,
QualType KmpTaskTWithPrivatesPtrQTy,
QualType KmpTaskTWithPrivatesQTy) {
auto &C = CGM.getContext();
FunctionArgList Args;
ImplicitParamDecl GtidArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpInt32Ty);
ImplicitParamDecl TaskTypeArg(C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr,
KmpTaskTWithPrivatesPtrQTy.withRestrict());
Args.push_back(&GtidArg);
Args.push_back(&TaskTypeArg);
FunctionType::ExtInfo Info;
auto &DestructorFnInfo =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(KmpInt32Ty, Args);
auto *DestructorFnTy = CGM.getTypes().GetFunctionType(DestructorFnInfo);
auto *DestructorFn =
llvm::Function::Create(DestructorFnTy, llvm::GlobalValue::InternalLinkage,
".omp_task_destructor.", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, DestructorFn,
DestructorFnInfo);
CodeGenFunction CGF(CGM);
CGF.disableDebugInfo();
CGF.StartFunction(GlobalDecl(), KmpInt32Ty, DestructorFn, DestructorFnInfo,
Args);
LValue Base = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(&TaskTypeArg),
KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
auto *KmpTaskTWithPrivatesQTyRD =
cast<RecordDecl>(KmpTaskTWithPrivatesQTy->getAsTagDecl());
auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
Base = CGF.EmitLValueForField(Base, *FI);
for (auto *Field :
cast<RecordDecl>(FI->getType()->getAsTagDecl())->fields()) {
if (auto DtorKind = Field->getType().isDestructedType()) {
auto FieldLValue = CGF.EmitLValueForField(Base, Field);
CGF.pushDestroy(DtorKind, FieldLValue.getAddress(), Field->getType());
}
}
CGF.FinishFunction();
return DestructorFn;
}
/// \brief Emit a privates mapping function for correct handling of private and
/// firstprivate variables.
/// \code
/// void .omp_task_privates_map.(const .privates. *noalias privs, <ty1>
/// **noalias priv1,..., <tyn> **noalias privn) {
/// *priv1 = &.privates.priv1;
/// ...;
/// *privn = &.privates.privn;
/// }
/// \endcode
static llvm::Value *
emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc,
ArrayRef<const Expr *> PrivateVars,
ArrayRef<const Expr *> FirstprivateVars,
ArrayRef<const Expr *> LastprivateVars,
QualType PrivatesQTy,
ArrayRef<PrivateDataTy> Privates) {
auto &C = CGM.getContext();
FunctionArgList Args;
ImplicitParamDecl TaskPrivatesArg(
C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
C.getPointerType(PrivatesQTy).withConst().withRestrict());
Args.push_back(&TaskPrivatesArg);
llvm::DenseMap<const VarDecl *, unsigned> PrivateVarsPos;
unsigned Counter = 1;
for (auto *E: PrivateVars) {
Args.push_back(ImplicitParamDecl::Create(
C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType()))
.withConst()
.withRestrict()));
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
PrivateVarsPos[VD] = Counter;
++Counter;
}
for (auto *E : FirstprivateVars) {
Args.push_back(ImplicitParamDecl::Create(
C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType()))
.withConst()
.withRestrict()));
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
PrivateVarsPos[VD] = Counter;
++Counter;
}
for (auto *E: LastprivateVars) {
Args.push_back(ImplicitParamDecl::Create(
C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType()))
.withConst()
.withRestrict()));
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
PrivateVarsPos[VD] = Counter;
++Counter;
}
auto &TaskPrivatesMapFnInfo =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *TaskPrivatesMapTy =
CGM.getTypes().GetFunctionType(TaskPrivatesMapFnInfo);
auto *TaskPrivatesMap = llvm::Function::Create(
TaskPrivatesMapTy, llvm::GlobalValue::InternalLinkage,
".omp_task_privates_map.", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskPrivatesMap,
TaskPrivatesMapFnInfo);
TaskPrivatesMap->removeFnAttr(llvm::Attribute::NoInline);
TaskPrivatesMap->addFnAttr(llvm::Attribute::AlwaysInline);
CodeGenFunction CGF(CGM);
CGF.disableDebugInfo();
CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskPrivatesMap,
TaskPrivatesMapFnInfo, Args);
// *privi = &.privates.privi;
LValue Base = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(&TaskPrivatesArg),
TaskPrivatesArg.getType()->castAs<PointerType>());
auto *PrivatesQTyRD = cast<RecordDecl>(PrivatesQTy->getAsTagDecl());
Counter = 0;
for (auto *Field : PrivatesQTyRD->fields()) {
auto FieldLVal = CGF.EmitLValueForField(Base, Field);
auto *VD = Args[PrivateVarsPos[Privates[Counter].second.Original]];
auto RefLVal = CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
auto RefLoadLVal = CGF.EmitLoadOfPointerLValue(
RefLVal.getAddress(), RefLVal.getType()->castAs<PointerType>());
CGF.EmitStoreOfScalar(FieldLVal.getPointer(), RefLoadLVal);
++Counter;
}
CGF.FinishFunction();
return TaskPrivatesMap;
}
static int array_pod_sort_comparator(const PrivateDataTy *P1,
const PrivateDataTy *P2) {
return P1->first < P2->first ? 1 : (P2->first < P1->first ? -1 : 0);
}
/// Emit initialization for private variables in task-based directives.
static void emitPrivatesInit(CodeGenFunction &CGF,
const OMPExecutableDirective &D,
Address KmpTaskSharedsPtr, LValue TDBase,
const RecordDecl *KmpTaskTWithPrivatesQTyRD,
QualType SharedsTy, QualType SharedsPtrTy,
const OMPTaskDataTy &Data,
ArrayRef<PrivateDataTy> Privates, bool ForDup) {
auto &C = CGF.getContext();
auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
LValue PrivatesBase = CGF.EmitLValueForField(TDBase, *FI);
LValue SrcBase;
if (!Data.FirstprivateVars.empty()) {
SrcBase = CGF.MakeAddrLValue(
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
KmpTaskSharedsPtr, CGF.ConvertTypeForMem(SharedsPtrTy)),
SharedsTy);
}
CodeGenFunction::CGCapturedStmtInfo CapturesInfo(
cast<CapturedStmt>(*D.getAssociatedStmt()));
FI = cast<RecordDecl>(FI->getType()->getAsTagDecl())->field_begin();
for (auto &&Pair : Privates) {
auto *VD = Pair.second.PrivateCopy;
auto *Init = VD->getAnyInitializer();
if (Init && (!ForDup || (isa<CXXConstructExpr>(Init) &&
!CGF.isTrivialInitializer(Init)))) {
LValue PrivateLValue = CGF.EmitLValueForField(PrivatesBase, *FI);
if (auto *Elem = Pair.second.PrivateElemInit) {
auto *OriginalVD = Pair.second.Original;
auto *SharedField = CapturesInfo.lookup(OriginalVD);
auto SharedRefLValue = CGF.EmitLValueForField(SrcBase, SharedField);
SharedRefLValue = CGF.MakeAddrLValue(
Address(SharedRefLValue.getPointer(), C.getDeclAlign(OriginalVD)),
SharedRefLValue.getType(), AlignmentSource::Decl);
QualType Type = OriginalVD->getType();
if (Type->isArrayType()) {
// Initialize firstprivate array.
if (!isa<CXXConstructExpr>(Init) || CGF.isTrivialInitializer(Init)) {
// Perform simple memcpy.
CGF.EmitAggregateAssign(PrivateLValue.getAddress(),
SharedRefLValue.getAddress(), Type);
} else {
// Initialize firstprivate array using element-by-element
// intialization.
CGF.EmitOMPAggregateAssign(
PrivateLValue.getAddress(), SharedRefLValue.getAddress(), Type,
[&CGF, Elem, Init, &CapturesInfo](Address DestElement,
Address SrcElement) {
// Clean up any temporaries needed by the initialization.
CodeGenFunction::OMPPrivateScope InitScope(CGF);
InitScope.addPrivate(
Elem, [SrcElement]() -> Address { return SrcElement; });
(void)InitScope.Privatize();
// Emit initialization for single element.
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(
CGF, &CapturesInfo);
CGF.EmitAnyExprToMem(Init, DestElement,
Init->getType().getQualifiers(),
/*IsInitializer=*/false);
});
}
} else {
CodeGenFunction::OMPPrivateScope InitScope(CGF);
InitScope.addPrivate(Elem, [SharedRefLValue]() -> Address {
return SharedRefLValue.getAddress();
});
(void)InitScope.Privatize();
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CapturesInfo);
CGF.EmitExprAsInit(Init, VD, PrivateLValue,
/*capturedByInit=*/false);
}
} else
CGF.EmitExprAsInit(Init, VD, PrivateLValue, /*capturedByInit=*/false);
}
++FI;
}
}
/// Check if duplication function is required for taskloops.
static bool checkInitIsRequired(CodeGenFunction &CGF,
ArrayRef<PrivateDataTy> Privates) {
bool InitRequired = false;
for (auto &&Pair : Privates) {
auto *VD = Pair.second.PrivateCopy;
auto *Init = VD->getAnyInitializer();
InitRequired = InitRequired || (Init && isa<CXXConstructExpr>(Init) &&
!CGF.isTrivialInitializer(Init));
}
return InitRequired;
}
/// Emit task_dup function (for initialization of
/// private/firstprivate/lastprivate vars and last_iter flag)
/// \code
/// void __task_dup_entry(kmp_task_t *task_dst, const kmp_task_t *task_src, int
/// lastpriv) {
/// // setup lastprivate flag
/// task_dst->last = lastpriv;
/// // could be constructor calls here...
/// }
/// \endcode
static llvm::Value *
emitTaskDupFunction(CodeGenModule &CGM, SourceLocation Loc,
const OMPExecutableDirective &D,
QualType KmpTaskTWithPrivatesPtrQTy,
const RecordDecl *KmpTaskTWithPrivatesQTyRD,
const RecordDecl *KmpTaskTQTyRD, QualType SharedsTy,
QualType SharedsPtrTy, const OMPTaskDataTy &Data,
ArrayRef<PrivateDataTy> Privates, bool WithLastIter) {
auto &C = CGM.getContext();
FunctionArgList Args;
ImplicitParamDecl DstArg(C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy);
ImplicitParamDecl SrcArg(C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy);
ImplicitParamDecl LastprivArg(C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, C.IntTy);
Args.push_back(&DstArg);
Args.push_back(&SrcArg);
Args.push_back(&LastprivArg);
auto &TaskDupFnInfo =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *TaskDupTy = CGM.getTypes().GetFunctionType(TaskDupFnInfo);
auto *TaskDup =
llvm::Function::Create(TaskDupTy, llvm::GlobalValue::InternalLinkage,
".omp_task_dup.", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskDup, TaskDupFnInfo);
CodeGenFunction CGF(CGM);
CGF.disableDebugInfo();
CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskDup, TaskDupFnInfo, Args);
LValue TDBase = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(&DstArg),
KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
// task_dst->liter = lastpriv;
if (WithLastIter) {
auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter);
LValue Base = CGF.EmitLValueForField(
TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin());
LValue LILVal = CGF.EmitLValueForField(Base, *LIFI);
llvm::Value *Lastpriv = CGF.EmitLoadOfScalar(
CGF.GetAddrOfLocalVar(&LastprivArg), /*Volatile=*/false, C.IntTy, Loc);
CGF.EmitStoreOfScalar(Lastpriv, LILVal);
}
// Emit initial values for private copies (if any).
assert(!Privates.empty());
Address KmpTaskSharedsPtr = Address::invalid();
if (!Data.FirstprivateVars.empty()) {
LValue TDBase = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(&SrcArg),
KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
LValue Base = CGF.EmitLValueForField(
TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin());
KmpTaskSharedsPtr = Address(
CGF.EmitLoadOfScalar(CGF.EmitLValueForField(
Base, *std::next(KmpTaskTQTyRD->field_begin(),
KmpTaskTShareds)),
Loc),
CGF.getNaturalTypeAlignment(SharedsTy));
}
emitPrivatesInit(CGF, D, KmpTaskSharedsPtr, TDBase, KmpTaskTWithPrivatesQTyRD,
SharedsTy, SharedsPtrTy, Data, Privates, /*ForDup=*/true);
CGF.FinishFunction();
return TaskDup;
}
/// Checks if destructor function is required to be generated.
/// \return true if cleanups are required, false otherwise.
static bool
checkDestructorsRequired(const RecordDecl *KmpTaskTWithPrivatesQTyRD) {
bool NeedsCleanup = false;
auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
auto *PrivateRD = cast<RecordDecl>(FI->getType()->getAsTagDecl());
for (auto *FD : PrivateRD->fields()) {
NeedsCleanup = NeedsCleanup || FD->getType().isDestructedType();
if (NeedsCleanup)
break;
}
return NeedsCleanup;
}
CGOpenMPRuntime::TaskResultTy
CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
const OMPExecutableDirective &D,
llvm::Value *TaskFunction, QualType SharedsTy,
Address Shareds, const OMPTaskDataTy &Data) {
auto &C = CGM.getContext();
llvm::SmallVector<PrivateDataTy, 4> Privates;
// Aggregate privates and sort them by the alignment.
auto I = Data.PrivateCopies.begin();
for (auto *E : Data.PrivateVars) {
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
Privates.push_back(std::make_pair(
C.getDeclAlign(VD),
PrivateHelpersTy(VD, cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()),
/*PrivateElemInit=*/nullptr)));
++I;
}
I = Data.FirstprivateCopies.begin();
auto IElemInitRef = Data.FirstprivateInits.begin();
for (auto *E : Data.FirstprivateVars) {
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
Privates.push_back(std::make_pair(
C.getDeclAlign(VD),
PrivateHelpersTy(
VD, cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()),
cast<VarDecl>(cast<DeclRefExpr>(*IElemInitRef)->getDecl()))));
++I;
++IElemInitRef;
}
I = Data.LastprivateCopies.begin();
for (auto *E : Data.LastprivateVars) {
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
Privates.push_back(std::make_pair(
C.getDeclAlign(VD),
PrivateHelpersTy(VD, cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()),
/*PrivateElemInit=*/nullptr)));
++I;
}
llvm::array_pod_sort(Privates.begin(), Privates.end(),
array_pod_sort_comparator);
auto KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1);
// Build type kmp_routine_entry_t (if not built yet).
emitKmpRoutineEntryT(KmpInt32Ty);
// Build type kmp_task_t (if not built yet).
if (KmpTaskTQTy.isNull()) {
KmpTaskTQTy = C.getRecordType(createKmpTaskTRecordDecl(
CGM, D.getDirectiveKind(), KmpInt32Ty, KmpRoutineEntryPtrQTy));
}
auto *KmpTaskTQTyRD = cast<RecordDecl>(KmpTaskTQTy->getAsTagDecl());
// Build particular struct kmp_task_t for the given task.
auto *KmpTaskTWithPrivatesQTyRD =
createKmpTaskTWithPrivatesRecordDecl(CGM, KmpTaskTQTy, Privates);
auto KmpTaskTWithPrivatesQTy = C.getRecordType(KmpTaskTWithPrivatesQTyRD);
QualType KmpTaskTWithPrivatesPtrQTy =
C.getPointerType(KmpTaskTWithPrivatesQTy);
auto *KmpTaskTWithPrivatesTy = CGF.ConvertType(KmpTaskTWithPrivatesQTy);
auto *KmpTaskTWithPrivatesPtrTy = KmpTaskTWithPrivatesTy->getPointerTo();
auto *KmpTaskTWithPrivatesTySize = CGF.getTypeSize(KmpTaskTWithPrivatesQTy);
QualType SharedsPtrTy = C.getPointerType(SharedsTy);
// Emit initial values for private copies (if any).
llvm::Value *TaskPrivatesMap = nullptr;
auto *TaskPrivatesMapTy =
std::next(cast<llvm::Function>(TaskFunction)->getArgumentList().begin(),
3)
->getType();
if (!Privates.empty()) {
auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
TaskPrivatesMap = emitTaskPrivateMappingFunction(
CGM, Loc, Data.PrivateVars, Data.FirstprivateVars, Data.LastprivateVars,
FI->getType(), Privates);
TaskPrivatesMap = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
TaskPrivatesMap, TaskPrivatesMapTy);
} else {
TaskPrivatesMap = llvm::ConstantPointerNull::get(
cast<llvm::PointerType>(TaskPrivatesMapTy));
}
// Build a proxy function kmp_int32 .omp_task_entry.(kmp_int32 gtid,
// kmp_task_t *tt);
auto *TaskEntry = emitProxyTaskFunction(
CGM, Loc, D.getDirectiveKind(), KmpInt32Ty, KmpTaskTWithPrivatesPtrQTy,
KmpTaskTWithPrivatesQTy, KmpTaskTQTy, SharedsPtrTy, TaskFunction,
TaskPrivatesMap);
// Build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
// kmp_routine_entry_t *task_entry);
// Task flags. Format is taken from
// http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h,
// description of kmp_tasking_flags struct.
enum {
TiedFlag = 0x1,
FinalFlag = 0x2,
DestructorsFlag = 0x8,
PriorityFlag = 0x20
};
unsigned Flags = Data.Tied ? TiedFlag : 0;
bool NeedsCleanup = false;
if (!Privates.empty()) {
NeedsCleanup = checkDestructorsRequired(KmpTaskTWithPrivatesQTyRD);
if (NeedsCleanup)
Flags = Flags | DestructorsFlag;
}
if (Data.Priority.getInt())
Flags = Flags | PriorityFlag;
auto *TaskFlags =
Data.Final.getPointer()
? CGF.Builder.CreateSelect(Data.Final.getPointer(),
CGF.Builder.getInt32(FinalFlag),
CGF.Builder.getInt32(/*C=*/0))
: CGF.Builder.getInt32(Data.Final.getInt() ? FinalFlag : 0);
TaskFlags = CGF.Builder.CreateOr(TaskFlags, CGF.Builder.getInt32(Flags));
auto *SharedsSize = CGM.getSize(C.getTypeSizeInChars(SharedsTy));
llvm::Value *AllocArgs[] = {emitUpdateLocation(CGF, Loc),
getThreadID(CGF, Loc), TaskFlags,
KmpTaskTWithPrivatesTySize, SharedsSize,
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
TaskEntry, KmpRoutineEntryPtrTy)};
auto *NewTask = CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_omp_task_alloc), AllocArgs);
auto *NewTaskNewTaskTTy = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
NewTask, KmpTaskTWithPrivatesPtrTy);
LValue Base = CGF.MakeNaturalAlignAddrLValue(NewTaskNewTaskTTy,
KmpTaskTWithPrivatesQTy);
LValue TDBase =
CGF.EmitLValueForField(Base, *KmpTaskTWithPrivatesQTyRD->field_begin());
// Fill the data in the resulting kmp_task_t record.
// Copy shareds if there are any.
Address KmpTaskSharedsPtr = Address::invalid();
if (!SharedsTy->getAsStructureType()->getDecl()->field_empty()) {
KmpTaskSharedsPtr =
Address(CGF.EmitLoadOfScalar(
CGF.EmitLValueForField(
TDBase, *std::next(KmpTaskTQTyRD->field_begin(),
KmpTaskTShareds)),
Loc),
CGF.getNaturalTypeAlignment(SharedsTy));
CGF.EmitAggregateCopy(KmpTaskSharedsPtr, Shareds, SharedsTy);
}
// Emit initial values for private copies (if any).
TaskResultTy Result;
if (!Privates.empty()) {
emitPrivatesInit(CGF, D, KmpTaskSharedsPtr, Base, KmpTaskTWithPrivatesQTyRD,
SharedsTy, SharedsPtrTy, Data, Privates,
/*ForDup=*/false);
if (isOpenMPTaskLoopDirective(D.getDirectiveKind()) &&
(!Data.LastprivateVars.empty() || checkInitIsRequired(CGF, Privates))) {
Result.TaskDupFn = emitTaskDupFunction(
CGM, Loc, D, KmpTaskTWithPrivatesPtrQTy, KmpTaskTWithPrivatesQTyRD,
KmpTaskTQTyRD, SharedsTy, SharedsPtrTy, Data, Privates,
/*WithLastIter=*/!Data.LastprivateVars.empty());
}
}
// Fields of union "kmp_cmplrdata_t" for destructors and priority.
enum { Priority = 0, Destructors = 1 };
// Provide pointer to function with destructors for privates.
auto FI = std::next(KmpTaskTQTyRD->field_begin(), Data1);
auto *KmpCmplrdataUD = (*FI)->getType()->getAsUnionType()->getDecl();
if (NeedsCleanup) {
llvm::Value *DestructorFn = emitDestructorsFunction(
CGM, Loc, KmpInt32Ty, KmpTaskTWithPrivatesPtrQTy,
KmpTaskTWithPrivatesQTy);
LValue Data1LV = CGF.EmitLValueForField(TDBase, *FI);
LValue DestructorsLV = CGF.EmitLValueForField(
Data1LV, *std::next(KmpCmplrdataUD->field_begin(), Destructors));
CGF.EmitStoreOfScalar(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
DestructorFn, KmpRoutineEntryPtrTy),
DestructorsLV);
}
// Set priority.
if (Data.Priority.getInt()) {
LValue Data2LV = CGF.EmitLValueForField(
TDBase, *std::next(KmpTaskTQTyRD->field_begin(), Data2));
LValue PriorityLV = CGF.EmitLValueForField(
Data2LV, *std::next(KmpCmplrdataUD->field_begin(), Priority));
CGF.EmitStoreOfScalar(Data.Priority.getPointer(), PriorityLV);
}
Result.NewTask = NewTask;
Result.TaskEntry = TaskEntry;
Result.NewTaskNewTaskTTy = NewTaskNewTaskTTy;
Result.TDBase = TDBase;
Result.KmpTaskTQTyRD = KmpTaskTQTyRD;
return Result;
}
void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
const OMPExecutableDirective &D,
llvm::Value *TaskFunction,
QualType SharedsTy, Address Shareds,
const Expr *IfCond,
const OMPTaskDataTy &Data) {
if (!CGF.HaveInsertPoint())
return;
TaskResultTy Result =
emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data);
llvm::Value *NewTask = Result.NewTask;
llvm::Value *TaskEntry = Result.TaskEntry;
llvm::Value *NewTaskNewTaskTTy = Result.NewTaskNewTaskTTy;
LValue TDBase = Result.TDBase;
RecordDecl *KmpTaskTQTyRD = Result.KmpTaskTQTyRD;
auto &C = CGM.getContext();
// Process list of dependences.
Address DependenciesArray = Address::invalid();
unsigned NumDependencies = Data.Dependences.size();
if (NumDependencies) {
// Dependence kind for RTL.
enum RTLDependenceKindTy { DepIn = 0x01, DepInOut = 0x3 };
enum RTLDependInfoFieldsTy { BaseAddr, Len, Flags };
RecordDecl *KmpDependInfoRD;
QualType FlagsTy =
C.getIntTypeForBitwidth(C.getTypeSize(C.BoolTy), /*Signed=*/false);
llvm::Type *LLVMFlagsTy = CGF.ConvertTypeForMem(FlagsTy);
if (KmpDependInfoTy.isNull()) {
KmpDependInfoRD = C.buildImplicitRecord("kmp_depend_info");
KmpDependInfoRD->startDefinition();
addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType());
addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType());
addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy);
KmpDependInfoRD->completeDefinition();
KmpDependInfoTy = C.getRecordType(KmpDependInfoRD);
} else
KmpDependInfoRD = cast<RecordDecl>(KmpDependInfoTy->getAsTagDecl());
CharUnits DependencySize = C.getTypeSizeInChars(KmpDependInfoTy);
// Define type kmp_depend_info[<Dependences.size()>];
QualType KmpDependInfoArrayTy = C.getConstantArrayType(
KmpDependInfoTy, llvm::APInt(/*numBits=*/64, NumDependencies),
ArrayType::Normal, /*IndexTypeQuals=*/0);
// kmp_depend_info[<Dependences.size()>] deps;
DependenciesArray =
CGF.CreateMemTemp(KmpDependInfoArrayTy, ".dep.arr.addr");
for (unsigned i = 0; i < NumDependencies; ++i) {
const Expr *E = Data.Dependences[i].second;
auto Addr = CGF.EmitLValue(E);
llvm::Value *Size;
QualType Ty = E->getType();
if (auto *ASE = dyn_cast<OMPArraySectionExpr>(E->IgnoreParenImpCasts())) {
LValue UpAddrLVal =
CGF.EmitOMPArraySectionExpr(ASE, /*LowerBound=*/false);
llvm::Value *UpAddr =
CGF.Builder.CreateConstGEP1_32(UpAddrLVal.getPointer(), /*Idx0=*/1);
llvm::Value *LowIntPtr =
CGF.Builder.CreatePtrToInt(Addr.getPointer(), CGM.SizeTy);
llvm::Value *UpIntPtr = CGF.Builder.CreatePtrToInt(UpAddr, CGM.SizeTy);
Size = CGF.Builder.CreateNUWSub(UpIntPtr, LowIntPtr);
} else
Size = CGF.getTypeSize(Ty);
auto Base = CGF.MakeAddrLValue(
CGF.Builder.CreateConstArrayGEP(DependenciesArray, i, DependencySize),
KmpDependInfoTy);
// deps[i].base_addr = &<Dependences[i].second>;
auto BaseAddrLVal = CGF.EmitLValueForField(
Base, *std::next(KmpDependInfoRD->field_begin(), BaseAddr));
CGF.EmitStoreOfScalar(
CGF.Builder.CreatePtrToInt(Addr.getPointer(), CGF.IntPtrTy),
BaseAddrLVal);
// deps[i].len = sizeof(<Dependences[i].second>);
auto LenLVal = CGF.EmitLValueForField(
Base, *std::next(KmpDependInfoRD->field_begin(), Len));
CGF.EmitStoreOfScalar(Size, LenLVal);
// deps[i].flags = <Dependences[i].first>;
RTLDependenceKindTy DepKind;
switch (Data.Dependences[i].first) {
case OMPC_DEPEND_in:
DepKind = DepIn;
break;
// Out and InOut dependencies must use the same code.
case OMPC_DEPEND_out:
case OMPC_DEPEND_inout:
DepKind = DepInOut;
break;
case OMPC_DEPEND_source:
case OMPC_DEPEND_sink:
case OMPC_DEPEND_unknown:
llvm_unreachable("Unknown task dependence type");
}
auto FlagsLVal = CGF.EmitLValueForField(
Base, *std::next(KmpDependInfoRD->field_begin(), Flags));
CGF.EmitStoreOfScalar(llvm::ConstantInt::get(LLVMFlagsTy, DepKind),
FlagsLVal);
}
DependenciesArray = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.Builder.CreateStructGEP(DependenciesArray, 0, CharUnits::Zero()),
CGF.VoidPtrTy);
}
// NOTE: routine and part_id fields are intialized by __kmpc_omp_task_alloc()
// libcall.
// Build kmp_int32 __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid,
// kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
// kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) if dependence
// list is not empty
auto *ThreadID = getThreadID(CGF, Loc);
auto *UpLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *TaskArgs[] = { UpLoc, ThreadID, NewTask };
llvm::Value *DepTaskArgs[7];
if (NumDependencies) {
DepTaskArgs[0] = UpLoc;
DepTaskArgs[1] = ThreadID;
DepTaskArgs[2] = NewTask;
DepTaskArgs[3] = CGF.Builder.getInt32(NumDependencies);
DepTaskArgs[4] = DependenciesArray.getPointer();
DepTaskArgs[5] = CGF.Builder.getInt32(0);
DepTaskArgs[6] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
}
auto &&ThenCodeGen = [this, Loc, &Data, TDBase, KmpTaskTQTyRD,
NumDependencies, &TaskArgs,
&DepTaskArgs](CodeGenFunction &CGF, PrePostActionTy &) {
if (!Data.Tied) {
auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId);
auto PartIdLVal = CGF.EmitLValueForField(TDBase, *PartIdFI);
CGF.EmitStoreOfScalar(CGF.Builder.getInt32(0), PartIdLVal);
}
if (NumDependencies) {
CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_omp_task_with_deps), DepTaskArgs);
} else {
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task),
TaskArgs);
}
// Check if parent region is untied and build return for untied task;
if (auto *Region =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
Region->emitUntiedSwitch(CGF);
};
llvm::Value *DepWaitTaskArgs[6];
if (NumDependencies) {
DepWaitTaskArgs[0] = UpLoc;
DepWaitTaskArgs[1] = ThreadID;
DepWaitTaskArgs[2] = CGF.Builder.getInt32(NumDependencies);
DepWaitTaskArgs[3] = DependenciesArray.getPointer();
DepWaitTaskArgs[4] = CGF.Builder.getInt32(0);
DepWaitTaskArgs[5] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
}
auto &&ElseCodeGen = [&TaskArgs, ThreadID, NewTaskNewTaskTTy, TaskEntry,
NumDependencies, &DepWaitTaskArgs](CodeGenFunction &CGF,
PrePostActionTy &) {
auto &RT = CGF.CGM.getOpenMPRuntime();
CodeGenFunction::RunCleanupsScope LocalScope(CGF);
// Build void __kmpc_omp_wait_deps(ident_t *, kmp_int32 gtid,
// kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
// ndeps_noalias, kmp_depend_info_t *noalias_dep_list); if dependence info
// is specified.
if (NumDependencies)
CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__kmpc_omp_wait_deps),
DepWaitTaskArgs);
// Call proxy_task_entry(gtid, new_task);
auto &&CodeGen = [TaskEntry, ThreadID, NewTaskNewTaskTTy](
CodeGenFunction &CGF, PrePostActionTy &Action) {
Action.Enter(CGF);
llvm::Value *OutlinedFnArgs[] = {ThreadID, NewTaskNewTaskTTy};
CGF.EmitCallOrInvoke(TaskEntry, OutlinedFnArgs);
};
// Build void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid,
// kmp_task_t *new_task);
// Build void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
// kmp_task_t *new_task);
RegionCodeGenTy RCG(CodeGen);
CommonActionTy Action(
RT.createRuntimeFunction(OMPRTL__kmpc_omp_task_begin_if0), TaskArgs,
RT.createRuntimeFunction(OMPRTL__kmpc_omp_task_complete_if0), TaskArgs);
RCG.setAction(Action);
RCG(CGF);
};
if (IfCond)
emitOMPIfClause(CGF, IfCond, ThenCodeGen, ElseCodeGen);
else {
RegionCodeGenTy ThenRCG(ThenCodeGen);
ThenRCG(CGF);
}
}
void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
const OMPLoopDirective &D,
llvm::Value *TaskFunction,
QualType SharedsTy, Address Shareds,
const Expr *IfCond,
const OMPTaskDataTy &Data) {
if (!CGF.HaveInsertPoint())
return;
TaskResultTy Result =
emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data);
// NOTE: routine and part_id fields are intialized by __kmpc_omp_task_alloc()
// libcall.
// Call to void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
// if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
// sched, kmp_uint64 grainsize, void *task_dup);
llvm::Value *ThreadID = getThreadID(CGF, Loc);
llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *IfVal;
if (IfCond) {
IfVal = CGF.Builder.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.IntTy,
/*isSigned=*/true);
} else
IfVal = llvm::ConstantInt::getSigned(CGF.IntTy, /*V=*/1);
LValue LBLVal = CGF.EmitLValueForField(
Result.TDBase,
*std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound));
auto *LBVar =
cast<VarDecl>(cast<DeclRefExpr>(D.getLowerBoundVariable())->getDecl());
CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(), LBLVal.getQuals(),
/*IsInitializer=*/true);
LValue UBLVal = CGF.EmitLValueForField(
Result.TDBase,
*std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound));
auto *UBVar =
cast<VarDecl>(cast<DeclRefExpr>(D.getUpperBoundVariable())->getDecl());
CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(), UBLVal.getQuals(),
/*IsInitializer=*/true);
LValue StLVal = CGF.EmitLValueForField(
Result.TDBase,
*std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTStride));
auto *StVar =
cast<VarDecl>(cast<DeclRefExpr>(D.getStrideVariable())->getDecl());
CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(), StLVal.getQuals(),
/*IsInitializer=*/true);
enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 };
llvm::Value *TaskArgs[] = {
UpLoc, ThreadID, Result.NewTask, IfVal, LBLVal.getPointer(),
UBLVal.getPointer(), CGF.EmitLoadOfScalar(StLVal, SourceLocation()),
llvm::ConstantInt::getSigned(CGF.IntTy, Data.Nogroup ? 1 : 0),
llvm::ConstantInt::getSigned(
CGF.IntTy, Data.Schedule.getPointer()
? Data.Schedule.getInt() ? NumTasks : Grainsize
: NoSchedule),
Data.Schedule.getPointer()
? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty,
/*isSigned=*/false)
: llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0),
Result.TaskDupFn
? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Result.TaskDupFn,
CGF.VoidPtrTy)
: llvm::ConstantPointerNull::get(CGF.VoidPtrTy)};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_taskloop), TaskArgs);
}
/// \brief Emit reduction operation for each element of array (required for
/// array sections) LHS op = RHS.
/// \param Type Type of array.
/// \param LHSVar Variable on the left side of the reduction operation
/// (references element of array in original variable).
/// \param RHSVar Variable on the right side of the reduction operation
/// (references element of array in original variable).
/// \param RedOpGen Generator of reduction operation with use of LHSVar and
/// RHSVar.
static void EmitOMPAggregateReduction(
CodeGenFunction &CGF, QualType Type, const VarDecl *LHSVar,
const VarDecl *RHSVar,
const llvm::function_ref<void(CodeGenFunction &CGF, const Expr *,
const Expr *, const Expr *)> &RedOpGen,
const Expr *XExpr = nullptr, const Expr *EExpr = nullptr,
const Expr *UpExpr = nullptr) {
// Perform element-by-element initialization.
QualType ElementTy;
Address LHSAddr = CGF.GetAddrOfLocalVar(LHSVar);
Address RHSAddr = CGF.GetAddrOfLocalVar(RHSVar);
// Drill down to the base element type on both arrays.
auto ArrayTy = Type->getAsArrayTypeUnsafe();
auto NumElements = CGF.emitArrayLength(ArrayTy, ElementTy, LHSAddr);
auto RHSBegin = RHSAddr.getPointer();
auto LHSBegin = LHSAddr.getPointer();
// Cast from pointer to array type to pointer to single element.
auto LHSEnd = CGF.Builder.CreateGEP(LHSBegin, NumElements);
// The basic structure here is a while-do loop.
auto BodyBB = CGF.createBasicBlock("omp.arraycpy.body");
auto DoneBB = CGF.createBasicBlock("omp.arraycpy.done");
auto IsEmpty =
CGF.Builder.CreateICmpEQ(LHSBegin, LHSEnd, "omp.arraycpy.isempty");
CGF.Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
// Enter the loop body, making that address the current address.
auto EntryBB = CGF.Builder.GetInsertBlock();
CGF.EmitBlock(BodyBB);
CharUnits ElementSize = CGF.getContext().getTypeSizeInChars(ElementTy);
llvm::PHINode *RHSElementPHI = CGF.Builder.CreatePHI(
RHSBegin->getType(), 2, "omp.arraycpy.srcElementPast");
RHSElementPHI->addIncoming(RHSBegin, EntryBB);
Address RHSElementCurrent =
Address(RHSElementPHI,
RHSAddr.getAlignment().alignmentOfArrayElement(ElementSize));
llvm::PHINode *LHSElementPHI = CGF.Builder.CreatePHI(
LHSBegin->getType(), 2, "omp.arraycpy.destElementPast");
LHSElementPHI->addIncoming(LHSBegin, EntryBB);
Address LHSElementCurrent =
Address(LHSElementPHI,
LHSAddr.getAlignment().alignmentOfArrayElement(ElementSize));
// Emit copy.
CodeGenFunction::OMPPrivateScope Scope(CGF);
Scope.addPrivate(LHSVar, [=]() -> Address { return LHSElementCurrent; });
Scope.addPrivate(RHSVar, [=]() -> Address { return RHSElementCurrent; });
Scope.Privatize();
RedOpGen(CGF, XExpr, EExpr, UpExpr);
Scope.ForceCleanup();
// Shift the address forward by one element.
auto LHSElementNext = CGF.Builder.CreateConstGEP1_32(
LHSElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element");
auto RHSElementNext = CGF.Builder.CreateConstGEP1_32(
RHSElementPHI, /*Idx0=*/1, "omp.arraycpy.src.element");
// Check whether we've reached the end.
auto Done =
CGF.Builder.CreateICmpEQ(LHSElementNext, LHSEnd, "omp.arraycpy.done");
CGF.Builder.CreateCondBr(Done, DoneBB, BodyBB);
LHSElementPHI->addIncoming(LHSElementNext, CGF.Builder.GetInsertBlock());
RHSElementPHI->addIncoming(RHSElementNext, CGF.Builder.GetInsertBlock());
// Done.
CGF.EmitBlock(DoneBB, /*IsFinished=*/true);
}
/// Emit reduction combiner. If the combiner is a simple expression emit it as
/// is, otherwise consider it as combiner of UDR decl and emit it as a call of
/// UDR combiner function.
static void emitReductionCombiner(CodeGenFunction &CGF,
const Expr *ReductionOp) {
if (auto *CE = dyn_cast<CallExpr>(ReductionOp))
if (auto *OVE = dyn_cast<OpaqueValueExpr>(CE->getCallee()))
if (auto *DRE =
dyn_cast<DeclRefExpr>(OVE->getSourceExpr()->IgnoreImpCasts()))
if (auto *DRD = dyn_cast<OMPDeclareReductionDecl>(DRE->getDecl())) {
std::pair<llvm::Function *, llvm::Function *> Reduction =
CGF.CGM.getOpenMPRuntime().getUserDefinedReduction(DRD);
RValue Func = RValue::get(Reduction.first);
CodeGenFunction::OpaqueValueMapping Map(CGF, OVE, Func);
CGF.EmitIgnoredExpr(ReductionOp);
return;
}
CGF.EmitIgnoredExpr(ReductionOp);
}
static llvm::Value *emitReductionFunction(CodeGenModule &CGM,
llvm::Type *ArgsType,
ArrayRef<const Expr *> Privates,
ArrayRef<const Expr *> LHSExprs,
ArrayRef<const Expr *> RHSExprs,
ArrayRef<const Expr *> ReductionOps) {
auto &C = CGM.getContext();
// void reduction_func(void *LHSArg, void *RHSArg);
FunctionArgList Args;
ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
C.VoidPtrTy);
ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
C.VoidPtrTy);
Args.push_back(&LHSArg);
Args.push_back(&RHSArg);
auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
".omp.reduction.reduction_func", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
// Dst = (void*[n])(LHSArg);
// Src = (void*[n])(RHSArg);
Address LHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)),
ArgsType), CGF.getPointerAlign());
Address RHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)),
ArgsType), CGF.getPointerAlign());
// ...
// *(Type<i>*)lhs[i] = RedOp<i>(*(Type<i>*)lhs[i], *(Type<i>*)rhs[i]);
// ...
CodeGenFunction::OMPPrivateScope Scope(CGF);
auto IPriv = Privates.begin();
unsigned Idx = 0;
for (unsigned I = 0, E = ReductionOps.size(); I < E; ++I, ++IPriv, ++Idx) {
auto RHSVar = cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl());
Scope.addPrivate(RHSVar, [&]() -> Address {
return emitAddrOfVarFromArray(CGF, RHS, Idx, RHSVar);
});
auto LHSVar = cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl());
Scope.addPrivate(LHSVar, [&]() -> Address {
return emitAddrOfVarFromArray(CGF, LHS, Idx, LHSVar);
});
QualType PrivTy = (*IPriv)->getType();
if (PrivTy->isVariablyModifiedType()) {
// Get array size and emit VLA type.
++Idx;
Address Elem =
CGF.Builder.CreateConstArrayGEP(LHS, Idx, CGF.getPointerSize());
llvm::Value *Ptr = CGF.Builder.CreateLoad(Elem);
auto *VLA = CGF.getContext().getAsVariableArrayType(PrivTy);
auto *OVE = cast<OpaqueValueExpr>(VLA->getSizeExpr());
CodeGenFunction::OpaqueValueMapping OpaqueMap(
CGF, OVE, RValue::get(CGF.Builder.CreatePtrToInt(Ptr, CGF.SizeTy)));
CGF.EmitVariablyModifiedType(PrivTy);
}
}
Scope.Privatize();
IPriv = Privates.begin();
auto ILHS = LHSExprs.begin();
auto IRHS = RHSExprs.begin();
for (auto *E : ReductionOps) {
if ((*IPriv)->getType()->isArrayType()) {
// Emit reduction for array section.
auto *LHSVar = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
EmitOMPAggregateReduction(
CGF, (*IPriv)->getType(), LHSVar, RHSVar,
[=](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) {
emitReductionCombiner(CGF, E);
});
} else
// Emit reduction for array subscript or single variable.
emitReductionCombiner(CGF, E);
++IPriv;
++ILHS;
++IRHS;
}
Scope.ForceCleanup();
CGF.FinishFunction();
return Fn;
}
static void emitSingleReductionCombiner(CodeGenFunction &CGF,
const Expr *ReductionOp,
const Expr *PrivateRef,
const DeclRefExpr *LHS,
const DeclRefExpr *RHS) {
if (PrivateRef->getType()->isArrayType()) {
// Emit reduction for array section.
auto *LHSVar = cast<VarDecl>(LHS->getDecl());
auto *RHSVar = cast<VarDecl>(RHS->getDecl());
EmitOMPAggregateReduction(
CGF, PrivateRef->getType(), LHSVar, RHSVar,
[=](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) {
emitReductionCombiner(CGF, ReductionOp);
});
} else
// Emit reduction for array subscript or single variable.
emitReductionCombiner(CGF, ReductionOp);
}
void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
ArrayRef<const Expr *> Privates,
ArrayRef<const Expr *> LHSExprs,
ArrayRef<const Expr *> RHSExprs,
ArrayRef<const Expr *> ReductionOps,
bool WithNowait, bool SimpleReduction) {
if (!CGF.HaveInsertPoint())
return;
// Next code should be emitted for reduction:
//
// static kmp_critical_name lock = { 0 };
//
// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
// *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
// ...
// *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
// *(Type<n>-1*)rhs[<n>-1]);
// }
//
// ...
// void *RedList[<n>] = {&<RHSExprs>[0], ..., &<RHSExprs>[<n>-1]};
// switch (__kmpc_reduce{_nowait}(<loc>, <gtid>, <n>, sizeof(RedList),
// RedList, reduce_func, &<lock>)) {
// case 1:
// ...
// <LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]);
// ...
// __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &<lock>);
// break;
// case 2:
// ...
// Atomic(<LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]));
// ...
// [__kmpc_end_reduce(<loc>, <gtid>, &<lock>);]
// break;
// default:;
// }
//
// if SimpleReduction is true, only the next code is generated:
// ...
// <LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]);
// ...
auto &C = CGM.getContext();
if (SimpleReduction) {
CodeGenFunction::RunCleanupsScope Scope(CGF);
auto IPriv = Privates.begin();
auto ILHS = LHSExprs.begin();
auto IRHS = RHSExprs.begin();
for (auto *E : ReductionOps) {
emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
cast<DeclRefExpr>(*IRHS));
++IPriv;
++ILHS;
++IRHS;
}
return;
}
// 1. Build a list of reduction variables.
// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
auto Size = RHSExprs.size();
for (auto *E : Privates) {
if (E->getType()->isVariablyModifiedType())
// Reserve place for array size.
++Size;
}
llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
QualType ReductionArrayTy =
C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
/*IndexTypeQuals=*/0);
Address ReductionList =
CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
auto IPriv = Privates.begin();
unsigned Idx = 0;
for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
Address Elem =
CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, CGF.getPointerSize());
CGF.Builder.CreateStore(
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
Elem);
if ((*IPriv)->getType()->isVariablyModifiedType()) {
// Store array size.
++Idx;
Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
CGF.getPointerSize());
llvm::Value *Size = CGF.Builder.CreateIntCast(
CGF.getVLASize(
CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
.first,
CGF.SizeTy, /*isSigned=*/false);
CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
Elem);
}
}
// 2. Emit reduce_func().
auto *ReductionFn = emitReductionFunction(
CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
LHSExprs, RHSExprs, ReductionOps);
// 3. Create static kmp_critical_name lock = { 0 };
auto *Lock = getCriticalRegionLock(".reduction");
// 4. Build res = __kmpc_reduce{_nowait}(<loc>, <gtid>, <n>, sizeof(RedList),
// RedList, reduce_func, &<lock>);
auto *IdentTLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
auto *ThreadId = getThreadID(CGF, Loc);
auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
ReductionList.getPointer(), CGF.VoidPtrTy);
llvm::Value *Args[] = {
IdentTLoc, // ident_t *<loc>
ThreadId, // i32 <gtid>
CGF.Builder.getInt32(RHSExprs.size()), // i32 <n>
ReductionArrayTySize, // size_type sizeof(RedList)
RL, // void *RedList
ReductionFn, // void (*) (void *, void *) <reduce_func>
Lock // kmp_critical_name *&<lock>
};
auto Res = CGF.EmitRuntimeCall(
createRuntimeFunction(WithNowait ? OMPRTL__kmpc_reduce_nowait
: OMPRTL__kmpc_reduce),
Args);
// 5. Build switch(res)
auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/2);
// 6. Build case 1:
// ...
// <LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]);
// ...
// __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &<lock>);
// break;
auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
CGF.EmitBlock(Case1BB);
// Add emission of __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &<lock>);
llvm::Value *EndArgs[] = {
IdentTLoc, // ident_t *<loc>
ThreadId, // i32 <gtid>
Lock // kmp_critical_name *&<lock>
};
auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps](
CodeGenFunction &CGF, PrePostActionTy &Action) {
auto IPriv = Privates.begin();
auto ILHS = LHSExprs.begin();
auto IRHS = RHSExprs.begin();
for (auto *E : ReductionOps) {
emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
cast<DeclRefExpr>(*IRHS));
++IPriv;
++ILHS;
++IRHS;
}
};
RegionCodeGenTy RCG(CodeGen);
CommonActionTy Action(
nullptr, llvm::None,
createRuntimeFunction(WithNowait ? OMPRTL__kmpc_end_reduce_nowait
: OMPRTL__kmpc_end_reduce),
EndArgs);
RCG.setAction(Action);
RCG(CGF);
CGF.EmitBranch(DefaultBB);
// 7. Build case 2:
// ...
// Atomic(<LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]));
// ...
// break;
auto *Case2BB = CGF.createBasicBlock(".omp.reduction.case2");
SwInst->addCase(CGF.Builder.getInt32(2), Case2BB);
CGF.EmitBlock(Case2BB);
auto &&AtomicCodeGen = [Loc, &Privates, &LHSExprs, &RHSExprs, &ReductionOps](
CodeGenFunction &CGF, PrePostActionTy &Action) {
auto ILHS = LHSExprs.begin();
auto IRHS = RHSExprs.begin();
auto IPriv = Privates.begin();
for (auto *E : ReductionOps) {
const Expr *XExpr = nullptr;
const Expr *EExpr = nullptr;
const Expr *UpExpr = nullptr;
BinaryOperatorKind BO = BO_Comma;
if (auto *BO = dyn_cast<BinaryOperator>(E)) {
if (BO->getOpcode() == BO_Assign) {
XExpr = BO->getLHS();
UpExpr = BO->getRHS();
}
}
// Try to emit update expression as a simple atomic.
auto *RHSExpr = UpExpr;
if (RHSExpr) {
// Analyze RHS part of the whole expression.
if (auto *ACO = dyn_cast<AbstractConditionalOperator>(
RHSExpr->IgnoreParenImpCasts())) {
// If this is a conditional operator, analyze its condition for
// min/max reduction operator.
RHSExpr = ACO->getCond();
}
if (auto *BORHS =
dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
EExpr = BORHS->getRHS();
BO = BORHS->getOpcode();
}
}
if (XExpr) {
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
auto &&AtomicRedGen = [BO, VD, IPriv,
Loc](CodeGenFunction &CGF, const Expr *XExpr,
const Expr *EExpr, const Expr *UpExpr) {
LValue X = CGF.EmitLValue(XExpr);
RValue E;
if (EExpr)
E = CGF.EmitAnyExpr(EExpr);
CGF.EmitOMPAtomicSimpleUpdateExpr(
X, E, BO, /*IsXLHSInRHSPart=*/true,
llvm::AtomicOrdering::Monotonic, Loc,
[&CGF, UpExpr, VD, IPriv, Loc](RValue XRValue) {
CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
PrivateScope.addPrivate(
VD, [&CGF, VD, XRValue, Loc]() -> Address {
Address LHSTemp = CGF.CreateMemTemp(VD->getType());
CGF.emitOMPSimpleStore(
CGF.MakeAddrLValue(LHSTemp, VD->getType()), XRValue,
VD->getType().getNonReferenceType(), Loc);
return LHSTemp;
});
(void)PrivateScope.Privatize();
return CGF.EmitAnyExpr(UpExpr);
});
};
if ((*IPriv)->getType()->isArrayType()) {
// Emit atomic reduction for array section.
auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), VD, RHSVar,
AtomicRedGen, XExpr, EExpr, UpExpr);
} else
// Emit atomic reduction for array subscript or single variable.
AtomicRedGen(CGF, XExpr, EExpr, UpExpr);
} else {
// Emit as a critical region.
auto &&CritRedGen = [E, Loc](CodeGenFunction &CGF, const Expr *,
const Expr *, const Expr *) {
auto &RT = CGF.CGM.getOpenMPRuntime();
RT.emitCriticalRegion(
CGF, ".atomic_reduction",
[=](CodeGenFunction &CGF, PrePostActionTy &Action) {
Action.Enter(CGF);
emitReductionCombiner(CGF, E);
},
Loc);
};
if ((*IPriv)->getType()->isArrayType()) {
auto *LHSVar = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), LHSVar, RHSVar,
CritRedGen);
} else
CritRedGen(CGF, nullptr, nullptr, nullptr);
}
++ILHS;
++IRHS;
++IPriv;
}
};
RegionCodeGenTy AtomicRCG(AtomicCodeGen);
if (!WithNowait) {
// Add emission of __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
llvm::Value *EndArgs[] = {
IdentTLoc, // ident_t *<loc>
ThreadId, // i32 <gtid>
Lock // kmp_critical_name *&<lock>
};
CommonActionTy Action(nullptr, llvm::None,
createRuntimeFunction(OMPRTL__kmpc_end_reduce),
EndArgs);
AtomicRCG.setAction(Action);
AtomicRCG(CGF);
} else
AtomicRCG(CGF);
CGF.EmitBranch(DefaultBB);
CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
}
void CGOpenMPRuntime::emitTaskwaitCall(CodeGenFunction &CGF,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
// global_tid);
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
// Ignore return result until untied tasks are supported.
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_taskwait), Args);
if (auto *Region = dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
Region->emitUntiedSwitch(CGF);
}
void CGOpenMPRuntime::emitInlinedDirective(CodeGenFunction &CGF,
OpenMPDirectiveKind InnerKind,
const RegionCodeGenTy &CodeGen,
bool HasCancel) {
if (!CGF.HaveInsertPoint())
return;
InlinedOpenMPRegionRAII Region(CGF, CodeGen, InnerKind, HasCancel);
CGF.CapturedStmtInfo->EmitBody(CGF, /*S=*/nullptr);
}
namespace {
enum RTCancelKind {
CancelNoreq = 0,
CancelParallel = 1,
CancelLoop = 2,
CancelSections = 3,
CancelTaskgroup = 4
};
} // anonymous namespace
static RTCancelKind getCancellationKind(OpenMPDirectiveKind CancelRegion) {
RTCancelKind CancelKind = CancelNoreq;
if (CancelRegion == OMPD_parallel)
CancelKind = CancelParallel;
else if (CancelRegion == OMPD_for)
CancelKind = CancelLoop;
else if (CancelRegion == OMPD_sections)
CancelKind = CancelSections;
else {
assert(CancelRegion == OMPD_taskgroup);
CancelKind = CancelTaskgroup;
}
return CancelKind;
}
void CGOpenMPRuntime::emitCancellationPointCall(
CodeGenFunction &CGF, SourceLocation Loc,
OpenMPDirectiveKind CancelRegion) {
if (!CGF.HaveInsertPoint())
return;
// Build call kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32
// global_tid, kmp_int32 cncl_kind);
if (auto *OMPRegionInfo =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
- if (OMPRegionInfo->hasCancel()) {
+ // For 'cancellation point taskgroup', the task region info may not have a
+ // cancel. This may instead happen in another adjacent task.
+ if (CancelRegion == OMPD_taskgroup || OMPRegionInfo->hasCancel()) {
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
CGF.Builder.getInt32(getCancellationKind(CancelRegion))};
// Ignore return result until untied tasks are supported.
auto *Result = CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_cancellationpoint), Args);
// if (__kmpc_cancellationpoint()) {
// exit from construct;
// }
auto *ExitBB = CGF.createBasicBlock(".cancel.exit");
auto *ContBB = CGF.createBasicBlock(".cancel.continue");
auto *Cmp = CGF.Builder.CreateIsNotNull(Result);
CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB);
CGF.EmitBlock(ExitBB);
// exit from construct;
auto CancelDest =
CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind());
CGF.EmitBranchThroughCleanup(CancelDest);
CGF.EmitBlock(ContBB, /*IsFinished=*/true);
}
}
}
void CGOpenMPRuntime::emitCancelCall(CodeGenFunction &CGF, SourceLocation Loc,
const Expr *IfCond,
OpenMPDirectiveKind CancelRegion) {
if (!CGF.HaveInsertPoint())
return;
// Build call kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 cncl_kind);
if (auto *OMPRegionInfo =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
auto &&ThenGen = [Loc, CancelRegion, OMPRegionInfo](CodeGenFunction &CGF,
PrePostActionTy &) {
auto &RT = CGF.CGM.getOpenMPRuntime();
llvm::Value *Args[] = {
RT.emitUpdateLocation(CGF, Loc), RT.getThreadID(CGF, Loc),
CGF.Builder.getInt32(getCancellationKind(CancelRegion))};
// Ignore return result until untied tasks are supported.
auto *Result = CGF.EmitRuntimeCall(
RT.createRuntimeFunction(OMPRTL__kmpc_cancel), Args);
// if (__kmpc_cancel()) {
// exit from construct;
// }
auto *ExitBB = CGF.createBasicBlock(".cancel.exit");
auto *ContBB = CGF.createBasicBlock(".cancel.continue");
auto *Cmp = CGF.Builder.CreateIsNotNull(Result);
CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB);
CGF.EmitBlock(ExitBB);
// exit from construct;
auto CancelDest =
CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind());
CGF.EmitBranchThroughCleanup(CancelDest);
CGF.EmitBlock(ContBB, /*IsFinished=*/true);
};
if (IfCond)
emitOMPIfClause(CGF, IfCond, ThenGen,
[](CodeGenFunction &, PrePostActionTy &) {});
else {
RegionCodeGenTy ThenRCG(ThenGen);
ThenRCG(CGF);
}
}
}
/// \brief Obtain information that uniquely identifies a target entry. This
/// consists of the file and device IDs as well as line number associated with
/// the relevant entry source location.
static void getTargetEntryUniqueInfo(ASTContext &C, SourceLocation Loc,
unsigned &DeviceID, unsigned &FileID,
unsigned &LineNum) {
auto &SM = C.getSourceManager();
// The loc should be always valid and have a file ID (the user cannot use
// #pragma directives in macros)
assert(Loc.isValid() && "Source location is expected to be always valid.");
assert(Loc.isFileID() && "Source location is expected to refer to a file.");
PresumedLoc PLoc = SM.getPresumedLoc(Loc);
assert(PLoc.isValid() && "Source location is expected to be always valid.");
llvm::sys::fs::UniqueID ID;
if (llvm::sys::fs::getUniqueID(PLoc.getFilename(), ID))
llvm_unreachable("Source file with target region no longer exists!");
DeviceID = ID.getDevice();
FileID = ID.getFile();
LineNum = PLoc.getLine();
}
void CGOpenMPRuntime::emitTargetOutlinedFunction(
const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
assert(!ParentName.empty() && "Invalid target region parent name!");
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
IsOffloadEntry, CodeGen);
}
void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
// Create a unique name for the entry function using the source location
// information of the current target region. The name will be something like:
//
// __omp_offloading_DD_FFFF_PP_lBB
//
// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
// mangled name of the function that encloses the target region and BB is the
// line number of the target region.
unsigned DeviceID;
unsigned FileID;
unsigned Line;
getTargetEntryUniqueInfo(CGM.getContext(), D.getLocStart(), DeviceID, FileID,
Line);
SmallString<64> EntryFnName;
{
llvm::raw_svector_ostream OS(EntryFnName);
OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
<< llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
}
const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
CodeGenFunction CGF(CGM, true);
CGOpenMPTargetRegionInfo CGInfo(CS, CodeGen, EntryFnName);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
OutlinedFn = CGF.GenerateOpenMPCapturedStmtFunction(CS);
// If this target outline function is not an offload entry, we don't need to
// register it.
if (!IsOffloadEntry)
return;
// The target region ID is used by the runtime library to identify the current
// target region, so it only has to be unique and not necessarily point to
// anything. It could be the pointer to the outlined function that implements
// the target region, but we aren't using that so that the compiler doesn't
// need to keep that, and could therefore inline the host function if proven
// worthwhile during optimization. In the other hand, if emitting code for the
// device, the ID has to be the function address so that it can retrieved from
// the offloading entry and launched by the runtime library. We also mark the
// outlined function to have external linkage in case we are emitting code for
// the device, because these functions will be entry points to the device.
if (CGM.getLangOpts().OpenMPIsDevice) {
OutlinedFnID = llvm::ConstantExpr::getBitCast(OutlinedFn, CGM.Int8PtrTy);
OutlinedFn->setLinkage(llvm::GlobalValue::ExternalLinkage);
} else
OutlinedFnID = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
llvm::GlobalValue::PrivateLinkage,
llvm::Constant::getNullValue(CGM.Int8Ty), ".omp_offload.region_id");
// Register the information for the entry associated with this target region.
OffloadEntriesInfoManager.registerTargetRegionEntryInfo(
DeviceID, FileID, ParentName, Line, OutlinedFn, OutlinedFnID,
/*Flags=*/0);
}
/// discard all CompoundStmts intervening between two constructs
static const Stmt *ignoreCompoundStmts(const Stmt *Body) {
while (auto *CS = dyn_cast_or_null<CompoundStmt>(Body))
Body = CS->body_front();
return Body;
}
/// \brief Emit the num_teams clause of an enclosed teams directive at the
/// target region scope. If there is no teams directive associated with the
/// target directive, or if there is no num_teams clause associated with the
/// enclosed teams directive, return nullptr.
static llvm::Value *
emitNumTeamsClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime,
CodeGenFunction &CGF,
const OMPExecutableDirective &D) {
assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the "
"teams directive expected to be "
"emitted only for the host!");
// FIXME: For the moment we do not support combined directives with target and
// teams, so we do not expect to get any num_teams clause in the provided
// directive. Once we support that, this assertion can be replaced by the
// actual emission of the clause expression.
assert(D.getSingleClause<OMPNumTeamsClause>() == nullptr &&
"Not expecting clause in directive.");
// If the current target region has a teams region enclosed, we need to get
// the number of teams to pass to the runtime function call. This is done
// by generating the expression in a inlined region. This is required because
// the expression is captured in the enclosing target environment when the
// teams directive is not combined with target.
const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
// FIXME: Accommodate other combined directives with teams when they become
// available.
if (auto *TeamsDir = dyn_cast_or_null<OMPTeamsDirective>(
ignoreCompoundStmts(CS.getCapturedStmt()))) {
if (auto *NTE = TeamsDir->getSingleClause<OMPNumTeamsClause>()) {
CGOpenMPInnerExprInfo CGInfo(CGF, CS);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
llvm::Value *NumTeams = CGF.EmitScalarExpr(NTE->getNumTeams());
return CGF.Builder.CreateIntCast(NumTeams, CGF.Int32Ty,
/*IsSigned=*/true);
}
// If we have an enclosed teams directive but no num_teams clause we use
// the default value 0.
return CGF.Builder.getInt32(0);
}
// No teams associated with the directive.
return nullptr;
}
/// \brief Emit the thread_limit clause of an enclosed teams directive at the
/// target region scope. If there is no teams directive associated with the
/// target directive, or if there is no thread_limit clause associated with the
/// enclosed teams directive, return nullptr.
static llvm::Value *
emitThreadLimitClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime,
CodeGenFunction &CGF,
const OMPExecutableDirective &D) {
assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the "
"teams directive expected to be "
"emitted only for the host!");
// FIXME: For the moment we do not support combined directives with target and
// teams, so we do not expect to get any thread_limit clause in the provided
// directive. Once we support that, this assertion can be replaced by the
// actual emission of the clause expression.
assert(D.getSingleClause<OMPThreadLimitClause>() == nullptr &&
"Not expecting clause in directive.");
// If the current target region has a teams region enclosed, we need to get
// the thread limit to pass to the runtime function call. This is done
// by generating the expression in a inlined region. This is required because
// the expression is captured in the enclosing target environment when the
// teams directive is not combined with target.
const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
// FIXME: Accommodate other combined directives with teams when they become
// available.
if (auto *TeamsDir = dyn_cast_or_null<OMPTeamsDirective>(
ignoreCompoundStmts(CS.getCapturedStmt()))) {
if (auto *TLE = TeamsDir->getSingleClause<OMPThreadLimitClause>()) {
CGOpenMPInnerExprInfo CGInfo(CGF, CS);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
llvm::Value *ThreadLimit = CGF.EmitScalarExpr(TLE->getThreadLimit());
return CGF.Builder.CreateIntCast(ThreadLimit, CGF.Int32Ty,
/*IsSigned=*/true);
}
// If we have an enclosed teams directive but no thread_limit clause we use
// the default value 0.
return CGF.Builder.getInt32(0);
}
// No teams associated with the directive.
return nullptr;
}
namespace {
// \brief Utility to handle information from clauses associated with a given
// construct that use mappable expressions (e.g. 'map' clause, 'to' clause).
// It provides a convenient interface to obtain the information and generate
// code for that information.
class MappableExprsHandler {
public:
/// \brief Values for bit flags used to specify the mapping type for
/// offloading.
enum OpenMPOffloadMappingFlags {
/// \brief Allocate memory on the device and move data from host to device.
OMP_MAP_TO = 0x01,
/// \brief Allocate memory on the device and move data from device to host.
OMP_MAP_FROM = 0x02,
/// \brief Always perform the requested mapping action on the element, even
/// if it was already mapped before.
OMP_MAP_ALWAYS = 0x04,
/// \brief Delete the element from the device environment, ignoring the
/// current reference count associated with the element.
OMP_MAP_DELETE = 0x08,
/// \brief The element being mapped is a pointer, therefore the pointee
/// should be mapped as well.
OMP_MAP_IS_PTR = 0x10,
/// \brief This flags signals that an argument is the first one relating to
/// a map/private clause expression. For some cases a single
/// map/privatization results in multiple arguments passed to the runtime
/// library.
OMP_MAP_FIRST_REF = 0x20,
/// \brief Signal that the runtime library has to return the device pointer
/// in the current position for the data being mapped.
OMP_MAP_RETURN_PTR = 0x40,
/// \brief This flag signals that the reference being passed is a pointer to
/// private data.
OMP_MAP_PRIVATE_PTR = 0x80,
/// \brief Pass the element to the device by value.
OMP_MAP_PRIVATE_VAL = 0x100,
};
/// Class that associates information with a base pointer to be passed to the
/// runtime library.
class BasePointerInfo {
/// The base pointer.
llvm::Value *Ptr = nullptr;
/// The base declaration that refers to this device pointer, or null if
/// there is none.
const ValueDecl *DevPtrDecl = nullptr;
public:
BasePointerInfo(llvm::Value *Ptr, const ValueDecl *DevPtrDecl = nullptr)
: Ptr(Ptr), DevPtrDecl(DevPtrDecl) {}
llvm::Value *operator*() const { return Ptr; }
const ValueDecl *getDevicePtrDecl() const { return DevPtrDecl; }
void setDevicePtrDecl(const ValueDecl *D) { DevPtrDecl = D; }
};
typedef SmallVector<BasePointerInfo, 16> MapBaseValuesArrayTy;
typedef SmallVector<llvm::Value *, 16> MapValuesArrayTy;
typedef SmallVector<unsigned, 16> MapFlagsArrayTy;
private:
/// \brief Directive from where the map clauses were extracted.
const OMPExecutableDirective &CurDir;
/// \brief Function the directive is being generated for.
CodeGenFunction &CGF;
/// \brief Set of all first private variables in the current directive.
llvm::SmallPtrSet<const VarDecl *, 8> FirstPrivateDecls;
/// Map between device pointer declarations and their expression components.
/// The key value for declarations in 'this' is null.
llvm::DenseMap<
const ValueDecl *,
SmallVector<OMPClauseMappableExprCommon::MappableExprComponentListRef, 4>>
DevPointersMap;
llvm::Value *getExprTypeSize(const Expr *E) const {
auto ExprTy = E->getType().getCanonicalType();
// Reference types are ignored for mapping purposes.
if (auto *RefTy = ExprTy->getAs<ReferenceType>())
ExprTy = RefTy->getPointeeType().getCanonicalType();
// Given that an array section is considered a built-in type, we need to
// do the calculation based on the length of the section instead of relying
// on CGF.getTypeSize(E->getType()).
if (const auto *OAE = dyn_cast<OMPArraySectionExpr>(E)) {
QualType BaseTy = OMPArraySectionExpr::getBaseOriginalType(
OAE->getBase()->IgnoreParenImpCasts())
.getCanonicalType();
// If there is no length associated with the expression, that means we
// are using the whole length of the base.
if (!OAE->getLength() && OAE->getColonLoc().isValid())
return CGF.getTypeSize(BaseTy);
llvm::Value *ElemSize;
if (auto *PTy = BaseTy->getAs<PointerType>())
ElemSize = CGF.getTypeSize(PTy->getPointeeType().getCanonicalType());
else {
auto *ATy = cast<ArrayType>(BaseTy.getTypePtr());
assert(ATy && "Expecting array type if not a pointer type.");
ElemSize = CGF.getTypeSize(ATy->getElementType().getCanonicalType());
}
// If we don't have a length at this point, that is because we have an
// array section with a single element.
if (!OAE->getLength())
return ElemSize;
auto *LengthVal = CGF.EmitScalarExpr(OAE->getLength());
LengthVal =
CGF.Builder.CreateIntCast(LengthVal, CGF.SizeTy, /*isSigned=*/false);
return CGF.Builder.CreateNUWMul(LengthVal, ElemSize);
}
return CGF.getTypeSize(ExprTy);
}
/// \brief Return the corresponding bits for a given map clause modifier. Add
/// a flag marking the map as a pointer if requested. Add a flag marking the
/// map as the first one of a series of maps that relate to the same map
/// expression.
unsigned getMapTypeBits(OpenMPMapClauseKind MapType,
OpenMPMapClauseKind MapTypeModifier, bool AddPtrFlag,
bool AddIsFirstFlag) const {
unsigned Bits = 0u;
switch (MapType) {
case OMPC_MAP_alloc:
case OMPC_MAP_release:
// alloc and release is the default behavior in the runtime library, i.e.
// if we don't pass any bits alloc/release that is what the runtime is
// going to do. Therefore, we don't need to signal anything for these two
// type modifiers.
break;
case OMPC_MAP_to:
Bits = OMP_MAP_TO;
break;
case OMPC_MAP_from:
Bits = OMP_MAP_FROM;
break;
case OMPC_MAP_tofrom:
Bits = OMP_MAP_TO | OMP_MAP_FROM;
break;
case OMPC_MAP_delete:
Bits = OMP_MAP_DELETE;
break;
default:
llvm_unreachable("Unexpected map type!");
break;
}
if (AddPtrFlag)
Bits |= OMP_MAP_IS_PTR;
if (AddIsFirstFlag)
Bits |= OMP_MAP_FIRST_REF;
if (MapTypeModifier == OMPC_MAP_always)
Bits |= OMP_MAP_ALWAYS;
return Bits;
}
/// \brief Return true if the provided expression is a final array section. A
/// final array section, is one whose length can't be proved to be one.
bool isFinalArraySectionExpression(const Expr *E) const {
auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
// It is not an array section and therefore not a unity-size one.
if (!OASE)
return false;
// An array section with no colon always refer to a single element.
if (OASE->getColonLoc().isInvalid())
return false;
auto *Length = OASE->getLength();
// If we don't have a length we have to check if the array has size 1
// for this dimension. Also, we should always expect a length if the
// base type is pointer.
if (!Length) {
auto BaseQTy = OMPArraySectionExpr::getBaseOriginalType(
OASE->getBase()->IgnoreParenImpCasts())
.getCanonicalType();
if (auto *ATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr()))
return ATy->getSize().getSExtValue() != 1;
// If we don't have a constant dimension length, we have to consider
// the current section as having any size, so it is not necessarily
// unitary. If it happen to be unity size, that's user fault.
return true;
}
// Check if the length evaluates to 1.
llvm::APSInt ConstLength;
if (!Length->EvaluateAsInt(ConstLength, CGF.getContext()))
return true; // Can have more that size 1.
return ConstLength.getSExtValue() != 1;
}
/// \brief Generate the base pointers, section pointers, sizes and map type
/// bits for the provided map type, map modifier, and expression components.
/// \a IsFirstComponent should be set to true if the provided set of
/// components is the first associated with a capture.
void generateInfoForComponentList(
OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapTypeModifier,
OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
MapBaseValuesArrayTy &BasePointers, MapValuesArrayTy &Pointers,
MapValuesArrayTy &Sizes, MapFlagsArrayTy &Types,
bool IsFirstComponentList) const {
// The following summarizes what has to be generated for each map and the
// types bellow. The generated information is expressed in this order:
// base pointer, section pointer, size, flags
// (to add to the ones that come from the map type and modifier).
//
// double d;
// int i[100];
// float *p;
//
// struct S1 {
// int i;
// float f[50];
// }
// struct S2 {
// int i;
// float f[50];
// S1 s;
// double *p;
// struct S2 *ps;
// }
// S2 s;
// S2 *ps;
//
// map(d)
// &d, &d, sizeof(double), noflags
//
// map(i)
// &i, &i, 100*sizeof(int), noflags
//
// map(i[1:23])
// &i(=&i[0]), &i[1], 23*sizeof(int), noflags
//
// map(p)
// &p, &p, sizeof(float*), noflags
//
// map(p[1:24])
// p, &p[1], 24*sizeof(float), noflags
//
// map(s)
// &s, &s, sizeof(S2), noflags
//
// map(s.i)
// &s, &(s.i), sizeof(int), noflags
//
// map(s.s.f)
// &s, &(s.i.f), 50*sizeof(int), noflags
//
// map(s.p)
// &s, &(s.p), sizeof(double*), noflags
//
// map(s.p[:22], s.a s.b)
// &s, &(s.p), sizeof(double*), noflags
// &(s.p), &(s.p[0]), 22*sizeof(double), ptr_flag + extra_flag
//
// map(s.ps)
// &s, &(s.ps), sizeof(S2*), noflags
//
// map(s.ps->s.i)
// &s, &(s.ps), sizeof(S2*), noflags
// &(s.ps), &(s.ps->s.i), sizeof(int), ptr_flag + extra_flag
//
// map(s.ps->ps)
// &s, &(s.ps), sizeof(S2*), noflags
// &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
//
// map(s.ps->ps->ps)
// &s, &(s.ps), sizeof(S2*), noflags
// &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
// &(s.ps->ps), &(s.ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
//
// map(s.ps->ps->s.f[:22])
// &s, &(s.ps), sizeof(S2*), noflags
// &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
// &(s.ps->ps), &(s.ps->ps->s.f[0]), 22*sizeof(float), ptr_flag + extra_flag
//
// map(ps)
// &ps, &ps, sizeof(S2*), noflags
//
// map(ps->i)
// ps, &(ps->i), sizeof(int), noflags
//
// map(ps->s.f)
// ps, &(ps->s.f[0]), 50*sizeof(float), noflags
//
// map(ps->p)
// ps, &(ps->p), sizeof(double*), noflags
//
// map(ps->p[:22])
// ps, &(ps->p), sizeof(double*), noflags
// &(ps->p), &(ps->p[0]), 22*sizeof(double), ptr_flag + extra_flag
//
// map(ps->ps)
// ps, &(ps->ps), sizeof(S2*), noflags
//
// map(ps->ps->s.i)
// ps, &(ps->ps), sizeof(S2*), noflags
// &(ps->ps), &(ps->ps->s.i), sizeof(int), ptr_flag + extra_flag
//
// map(ps->ps->ps)
// ps, &(ps->ps), sizeof(S2*), noflags
// &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
//
// map(ps->ps->ps->ps)
// ps, &(ps->ps), sizeof(S2*), noflags
// &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
// &(ps->ps->ps), &(ps->ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
//
// map(ps->ps->ps->s.f[:22])
// ps, &(ps->ps), sizeof(S2*), noflags
// &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
// &(ps->ps->ps), &(ps->ps->ps->s.f[0]), 22*sizeof(float), ptr_flag +
// extra_flag
// Track if the map information being generated is the first for a capture.
bool IsCaptureFirstInfo = IsFirstComponentList;
// Scan the components from the base to the complete expression.
auto CI = Components.rbegin();
auto CE = Components.rend();
auto I = CI;
// Track if the map information being generated is the first for a list of
// components.
bool IsExpressionFirstInfo = true;
llvm::Value *BP = nullptr;
if (auto *ME = dyn_cast<MemberExpr>(I->getAssociatedExpression())) {
// The base is the 'this' pointer. The content of the pointer is going
// to be the base of the field being mapped.
BP = CGF.EmitScalarExpr(ME->getBase());
} else {
// The base is the reference to the variable.
// BP = &Var.
BP = CGF.EmitLValue(cast<DeclRefExpr>(I->getAssociatedExpression()))
.getPointer();
// If the variable is a pointer and is being dereferenced (i.e. is not
// the last component), the base has to be the pointer itself, not its
// reference. References are ignored for mapping purposes.
QualType Ty =
I->getAssociatedDeclaration()->getType().getNonReferenceType();
if (Ty->isAnyPointerType() && std::next(I) != CE) {
auto PtrAddr = CGF.MakeNaturalAlignAddrLValue(BP, Ty);
BP = CGF.EmitLoadOfPointerLValue(PtrAddr.getAddress(),
Ty->castAs<PointerType>())
.getPointer();
// We do not need to generate individual map information for the
// pointer, it can be associated with the combined storage.
++I;
}
}
for (; I != CE; ++I) {
auto Next = std::next(I);
// We need to generate the addresses and sizes if this is the last
// component, if the component is a pointer or if it is an array section
// whose length can't be proved to be one. If this is a pointer, it
// becomes the base address for the following components.
// A final array section, is one whose length can't be proved to be one.
bool IsFinalArraySection =
isFinalArraySectionExpression(I->getAssociatedExpression());
// Get information on whether the element is a pointer. Have to do a
// special treatment for array sections given that they are built-in
// types.
const auto *OASE =
dyn_cast<OMPArraySectionExpr>(I->getAssociatedExpression());
bool IsPointer =
(OASE &&
OMPArraySectionExpr::getBaseOriginalType(OASE)
.getCanonicalType()
->isAnyPointerType()) ||
I->getAssociatedExpression()->getType()->isAnyPointerType();
if (Next == CE || IsPointer || IsFinalArraySection) {
// If this is not the last component, we expect the pointer to be
// associated with an array expression or member expression.
assert((Next == CE ||
isa<MemberExpr>(Next->getAssociatedExpression()) ||
isa<ArraySubscriptExpr>(Next->getAssociatedExpression()) ||
isa<OMPArraySectionExpr>(Next->getAssociatedExpression())) &&
"Unexpected expression");
auto *LB = CGF.EmitLValue(I->getAssociatedExpression()).getPointer();
auto *Size = getExprTypeSize(I->getAssociatedExpression());
// If we have a member expression and the current component is a
// reference, we have to map the reference too. Whenever we have a
// reference, the section that reference refers to is going to be a
// load instruction from the storage assigned to the reference.
if (isa<MemberExpr>(I->getAssociatedExpression()) &&
I->getAssociatedDeclaration()->getType()->isReferenceType()) {
auto *LI = cast<llvm::LoadInst>(LB);
auto *RefAddr = LI->getPointerOperand();
BasePointers.push_back(BP);
Pointers.push_back(RefAddr);
Sizes.push_back(CGF.getTypeSize(CGF.getContext().VoidPtrTy));
Types.push_back(getMapTypeBits(
/*MapType*/ OMPC_MAP_alloc, /*MapTypeModifier=*/OMPC_MAP_unknown,
!IsExpressionFirstInfo, IsCaptureFirstInfo));
IsExpressionFirstInfo = false;
IsCaptureFirstInfo = false;
// The reference will be the next base address.
BP = RefAddr;
}
BasePointers.push_back(BP);
Pointers.push_back(LB);
Sizes.push_back(Size);
// We need to add a pointer flag for each map that comes from the
// same expression except for the first one. We also need to signal
// this map is the first one that relates with the current capture
// (there is a set of entries for each capture).
Types.push_back(getMapTypeBits(MapType, MapTypeModifier,
!IsExpressionFirstInfo,
IsCaptureFirstInfo));
// If we have a final array section, we are done with this expression.
if (IsFinalArraySection)
break;
// The pointer becomes the base for the next element.
if (Next != CE)
BP = LB;
IsExpressionFirstInfo = false;
IsCaptureFirstInfo = false;
continue;
}
}
}
/// \brief Return the adjusted map modifiers if the declaration a capture
/// refers to appears in a first-private clause. This is expected to be used
/// only with directives that start with 'target'.
unsigned adjustMapModifiersForPrivateClauses(const CapturedStmt::Capture &Cap,
unsigned CurrentModifiers) {
assert(Cap.capturesVariable() && "Expected capture by reference only!");
// A first private variable captured by reference will use only the
// 'private ptr' and 'map to' flag. Return the right flags if the captured
// declaration is known as first-private in this handler.
if (FirstPrivateDecls.count(Cap.getCapturedVar()))
return MappableExprsHandler::OMP_MAP_PRIVATE_PTR |
MappableExprsHandler::OMP_MAP_TO;
// We didn't modify anything.
return CurrentModifiers;
}
public:
MappableExprsHandler(const OMPExecutableDirective &Dir, CodeGenFunction &CGF)
: CurDir(Dir), CGF(CGF) {
// Extract firstprivate clause information.
for (const auto *C : Dir.getClausesOfKind<OMPFirstprivateClause>())
for (const auto *D : C->varlists())
FirstPrivateDecls.insert(
cast<VarDecl>(cast<DeclRefExpr>(D)->getDecl())->getCanonicalDecl());
// Extract device pointer clause information.
for (const auto *C : Dir.getClausesOfKind<OMPIsDevicePtrClause>())
for (auto L : C->component_lists())
DevPointersMap[L.first].push_back(L.second);
}
/// \brief Generate all the base pointers, section pointers, sizes and map
/// types for the extracted mappable expressions. Also, for each item that
/// relates with a device pointer, a pair of the relevant declaration and
/// index where it occurs is appended to the device pointers info array.
void generateAllInfo(MapBaseValuesArrayTy &BasePointers,
MapValuesArrayTy &Pointers, MapValuesArrayTy &Sizes,
MapFlagsArrayTy &Types) const {
BasePointers.clear();
Pointers.clear();
Sizes.clear();
Types.clear();
struct MapInfo {
/// Kind that defines how a device pointer has to be returned.
enum ReturnPointerKind {
// Don't have to return any pointer.
RPK_None,
// Pointer is the base of the declaration.
RPK_Base,
// Pointer is a member of the base declaration - 'this'
RPK_Member,
// Pointer is a reference and a member of the base declaration - 'this'
RPK_MemberReference,
};
OMPClauseMappableExprCommon::MappableExprComponentListRef Components;
OpenMPMapClauseKind MapType;
OpenMPMapClauseKind MapTypeModifier;
ReturnPointerKind ReturnDevicePointer;
MapInfo()
: MapType(OMPC_MAP_unknown), MapTypeModifier(OMPC_MAP_unknown),
ReturnDevicePointer(RPK_None) {}
MapInfo(
OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapTypeModifier,
ReturnPointerKind ReturnDevicePointer)
: Components(Components), MapType(MapType),
MapTypeModifier(MapTypeModifier),
ReturnDevicePointer(ReturnDevicePointer) {}
};
// We have to process the component lists that relate with the same
// declaration in a single chunk so that we can generate the map flags
// correctly. Therefore, we organize all lists in a map.
llvm::DenseMap<const ValueDecl *, SmallVector<MapInfo, 8>> Info;
// Helper function to fill the information map for the different supported
// clauses.
auto &&InfoGen = [&Info](
const ValueDecl *D,
OMPClauseMappableExprCommon::MappableExprComponentListRef L,
OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapModifier,
MapInfo::ReturnPointerKind ReturnDevicePointer) {
const ValueDecl *VD =
D ? cast<ValueDecl>(D->getCanonicalDecl()) : nullptr;
Info[VD].push_back({L, MapType, MapModifier, ReturnDevicePointer});
};
// FIXME: MSVC 2013 seems to require this-> to find member CurDir.
for (auto *C : this->CurDir.getClausesOfKind<OMPMapClause>())
for (auto L : C->component_lists())
InfoGen(L.first, L.second, C->getMapType(), C->getMapTypeModifier(),
MapInfo::RPK_None);
for (auto *C : this->CurDir.getClausesOfKind<OMPToClause>())
for (auto L : C->component_lists())
InfoGen(L.first, L.second, OMPC_MAP_to, OMPC_MAP_unknown,
MapInfo::RPK_None);
for (auto *C : this->CurDir.getClausesOfKind<OMPFromClause>())
for (auto L : C->component_lists())
InfoGen(L.first, L.second, OMPC_MAP_from, OMPC_MAP_unknown,
MapInfo::RPK_None);
// Look at the use_device_ptr clause information and mark the existing map
// entries as such. If there is no map information for an entry in the
// use_device_ptr list, we create one with map type 'alloc' and zero size
// section. It is the user fault if that was not mapped before.
// FIXME: MSVC 2013 seems to require this-> to find member CurDir.
for (auto *C : this->CurDir.getClausesOfKind<OMPUseDevicePtrClause>())
for (auto L : C->component_lists()) {
assert(!L.second.empty() && "Not expecting empty list of components!");
const ValueDecl *VD = L.second.back().getAssociatedDeclaration();
VD = cast<ValueDecl>(VD->getCanonicalDecl());
auto *IE = L.second.back().getAssociatedExpression();
// If the first component is a member expression, we have to look into
// 'this', which maps to null in the map of map information. Otherwise
// look directly for the information.
auto It = Info.find(isa<MemberExpr>(IE) ? nullptr : VD);
// We potentially have map information for this declaration already.
// Look for the first set of components that refer to it.
if (It != Info.end()) {
auto CI = std::find_if(
It->second.begin(), It->second.end(), [VD](const MapInfo &MI) {
return MI.Components.back().getAssociatedDeclaration() == VD;
});
// If we found a map entry, signal that the pointer has to be returned
// and move on to the next declaration.
if (CI != It->second.end()) {
CI->ReturnDevicePointer = isa<MemberExpr>(IE)
? (VD->getType()->isReferenceType()
? MapInfo::RPK_MemberReference
: MapInfo::RPK_Member)
: MapInfo::RPK_Base;
continue;
}
}
// We didn't find any match in our map information - generate a zero
// size array section.
// FIXME: MSVC 2013 seems to require this-> to find member CGF.
llvm::Value *Ptr =
this->CGF
.EmitLoadOfLValue(this->CGF.EmitLValue(IE), SourceLocation())
.getScalarVal();
BasePointers.push_back({Ptr, VD});
Pointers.push_back(Ptr);
Sizes.push_back(llvm::Constant::getNullValue(this->CGF.SizeTy));
Types.push_back(OMP_MAP_RETURN_PTR | OMP_MAP_FIRST_REF);
}
for (auto &M : Info) {
// We need to know when we generate information for the first component
// associated with a capture, because the mapping flags depend on it.
bool IsFirstComponentList = true;
for (MapInfo &L : M.second) {
assert(!L.Components.empty() &&
"Not expecting declaration with no component lists.");
// Remember the current base pointer index.
unsigned CurrentBasePointersIdx = BasePointers.size();
// FIXME: MSVC 2013 seems to require this-> to find the member method.
this->generateInfoForComponentList(L.MapType, L.MapTypeModifier,
L.Components, BasePointers, Pointers,
Sizes, Types, IsFirstComponentList);
// If this entry relates with a device pointer, set the relevant
// declaration and add the 'return pointer' flag.
if (IsFirstComponentList &&
L.ReturnDevicePointer != MapInfo::RPK_None) {
// If the pointer is not the base of the map, we need to skip the
// base. If it is a reference in a member field, we also need to skip
// the map of the reference.
if (L.ReturnDevicePointer != MapInfo::RPK_Base) {
++CurrentBasePointersIdx;
if (L.ReturnDevicePointer == MapInfo::RPK_MemberReference)
++CurrentBasePointersIdx;
}
assert(BasePointers.size() > CurrentBasePointersIdx &&
"Unexpected number of mapped base pointers.");
auto *RelevantVD = L.Components.back().getAssociatedDeclaration();
assert(RelevantVD &&
"No relevant declaration related with device pointer??");
BasePointers[CurrentBasePointersIdx].setDevicePtrDecl(RelevantVD);
Types[CurrentBasePointersIdx] |= OMP_MAP_RETURN_PTR;
}
IsFirstComponentList = false;
}
}
}
/// \brief Generate the base pointers, section pointers, sizes and map types
/// associated to a given capture.
void generateInfoForCapture(const CapturedStmt::Capture *Cap,
llvm::Value *Arg,
MapBaseValuesArrayTy &BasePointers,
MapValuesArrayTy &Pointers,
MapValuesArrayTy &Sizes,
MapFlagsArrayTy &Types) const {
assert(!Cap->capturesVariableArrayType() &&
"Not expecting to generate map info for a variable array type!");
BasePointers.clear();
Pointers.clear();
Sizes.clear();
Types.clear();
// We need to know when we generating information for the first component
// associated with a capture, because the mapping flags depend on it.
bool IsFirstComponentList = true;
const ValueDecl *VD =
Cap->capturesThis()
? nullptr
: cast<ValueDecl>(Cap->getCapturedVar()->getCanonicalDecl());
// If this declaration appears in a is_device_ptr clause we just have to
// pass the pointer by value. If it is a reference to a declaration, we just
// pass its value, otherwise, if it is a member expression, we need to map
// 'to' the field.
if (!VD) {
auto It = DevPointersMap.find(VD);
if (It != DevPointersMap.end()) {
for (auto L : It->second) {
generateInfoForComponentList(
/*MapType=*/OMPC_MAP_to, /*MapTypeModifier=*/OMPC_MAP_unknown, L,
BasePointers, Pointers, Sizes, Types, IsFirstComponentList);
IsFirstComponentList = false;
}
return;
}
} else if (DevPointersMap.count(VD)) {
BasePointers.push_back({Arg, VD});
Pointers.push_back(Arg);
Sizes.push_back(CGF.getTypeSize(CGF.getContext().VoidPtrTy));
Types.push_back(OMP_MAP_PRIVATE_VAL | OMP_MAP_FIRST_REF);
return;
}
// FIXME: MSVC 2013 seems to require this-> to find member CurDir.
for (auto *C : this->CurDir.getClausesOfKind<OMPMapClause>())
for (auto L : C->decl_component_lists(VD)) {
assert(L.first == VD &&
"We got information for the wrong declaration??");
assert(!L.second.empty() &&
"Not expecting declaration with no component lists.");
generateInfoForComponentList(C->getMapType(), C->getMapTypeModifier(),
L.second, BasePointers, Pointers, Sizes,
Types, IsFirstComponentList);
IsFirstComponentList = false;
}
return;
}
/// \brief Generate the default map information for a given capture \a CI,
/// record field declaration \a RI and captured value \a CV.
void generateDefaultMapInfo(const CapturedStmt::Capture &CI,
const FieldDecl &RI, llvm::Value *CV,
MapBaseValuesArrayTy &CurBasePointers,
MapValuesArrayTy &CurPointers,
MapValuesArrayTy &CurSizes,
MapFlagsArrayTy &CurMapTypes) {
// Do the default mapping.
if (CI.capturesThis()) {
CurBasePointers.push_back(CV);
CurPointers.push_back(CV);
const PointerType *PtrTy = cast<PointerType>(RI.getType().getTypePtr());
CurSizes.push_back(CGF.getTypeSize(PtrTy->getPointeeType()));
// Default map type.
CurMapTypes.push_back(OMP_MAP_TO | OMP_MAP_FROM);
} else if (CI.capturesVariableByCopy()) {
CurBasePointers.push_back(CV);
CurPointers.push_back(CV);
if (!RI.getType()->isAnyPointerType()) {
// We have to signal to the runtime captures passed by value that are
// not pointers.
CurMapTypes.push_back(OMP_MAP_PRIVATE_VAL);
CurSizes.push_back(CGF.getTypeSize(RI.getType()));
} else {
// Pointers are implicitly mapped with a zero size and no flags
// (other than first map that is added for all implicit maps).
CurMapTypes.push_back(0u);
CurSizes.push_back(llvm::Constant::getNullValue(CGF.SizeTy));
}
} else {
assert(CI.capturesVariable() && "Expected captured reference.");
CurBasePointers.push_back(CV);
CurPointers.push_back(CV);
const ReferenceType *PtrTy =
cast<ReferenceType>(RI.getType().getTypePtr());
QualType ElementType = PtrTy->getPointeeType();
CurSizes.push_back(CGF.getTypeSize(ElementType));
// The default map type for a scalar/complex type is 'to' because by
// default the value doesn't have to be retrieved. For an aggregate
// type, the default is 'tofrom'.
CurMapTypes.push_back(ElementType->isAggregateType()
? (OMP_MAP_TO | OMP_MAP_FROM)
: OMP_MAP_TO);
// If we have a capture by reference we may need to add the private
// pointer flag if the base declaration shows in some first-private
// clause.
CurMapTypes.back() =
adjustMapModifiersForPrivateClauses(CI, CurMapTypes.back());
}
// Every default map produces a single argument, so, it is always the
// first one.
CurMapTypes.back() |= OMP_MAP_FIRST_REF;
}
};
enum OpenMPOffloadingReservedDeviceIDs {
/// \brief Device ID if the device was not defined, runtime should get it
/// from environment variables in the spec.
OMP_DEVICEID_UNDEF = -1,
};
} // anonymous namespace
/// \brief Emit the arrays used to pass the captures and map information to the
/// offloading runtime library. If there is no map or capture information,
/// return nullptr by reference.
static void
emitOffloadingArrays(CodeGenFunction &CGF,
MappableExprsHandler::MapBaseValuesArrayTy &BasePointers,
MappableExprsHandler::MapValuesArrayTy &Pointers,
MappableExprsHandler::MapValuesArrayTy &Sizes,
MappableExprsHandler::MapFlagsArrayTy &MapTypes,
CGOpenMPRuntime::TargetDataInfo &Info) {
auto &CGM = CGF.CGM;
auto &Ctx = CGF.getContext();
// Reset the array information.
Info.clearArrayInfo();
Info.NumberOfPtrs = BasePointers.size();
if (Info.NumberOfPtrs) {
// Detect if we have any capture size requiring runtime evaluation of the
// size so that a constant array could be eventually used.
bool hasRuntimeEvaluationCaptureSize = false;
for (auto *S : Sizes)
if (!isa<llvm::Constant>(S)) {
hasRuntimeEvaluationCaptureSize = true;
break;
}
llvm::APInt PointerNumAP(32, Info.NumberOfPtrs, /*isSigned=*/true);
QualType PointerArrayType =
Ctx.getConstantArrayType(Ctx.VoidPtrTy, PointerNumAP, ArrayType::Normal,
/*IndexTypeQuals=*/0);
Info.BasePointersArray =
CGF.CreateMemTemp(PointerArrayType, ".offload_baseptrs").getPointer();
Info.PointersArray =
CGF.CreateMemTemp(PointerArrayType, ".offload_ptrs").getPointer();
// If we don't have any VLA types or other types that require runtime
// evaluation, we can use a constant array for the map sizes, otherwise we
// need to fill up the arrays as we do for the pointers.
if (hasRuntimeEvaluationCaptureSize) {
QualType SizeArrayType = Ctx.getConstantArrayType(
Ctx.getSizeType(), PointerNumAP, ArrayType::Normal,
/*IndexTypeQuals=*/0);
Info.SizesArray =
CGF.CreateMemTemp(SizeArrayType, ".offload_sizes").getPointer();
} else {
// We expect all the sizes to be constant, so we collect them to create
// a constant array.
SmallVector<llvm::Constant *, 16> ConstSizes;
for (auto S : Sizes)
ConstSizes.push_back(cast<llvm::Constant>(S));
auto *SizesArrayInit = llvm::ConstantArray::get(
llvm::ArrayType::get(CGM.SizeTy, ConstSizes.size()), ConstSizes);
auto *SizesArrayGbl = new llvm::GlobalVariable(
CGM.getModule(), SizesArrayInit->getType(),
/*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
SizesArrayInit, ".offload_sizes");
SizesArrayGbl->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
Info.SizesArray = SizesArrayGbl;
}
// The map types are always constant so we don't need to generate code to
// fill arrays. Instead, we create an array constant.
llvm::Constant *MapTypesArrayInit =
llvm::ConstantDataArray::get(CGF.Builder.getContext(), MapTypes);
auto *MapTypesArrayGbl = new llvm::GlobalVariable(
CGM.getModule(), MapTypesArrayInit->getType(),
/*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
MapTypesArrayInit, ".offload_maptypes");
MapTypesArrayGbl->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
Info.MapTypesArray = MapTypesArrayGbl;
for (unsigned i = 0; i < Info.NumberOfPtrs; ++i) {
llvm::Value *BPVal = *BasePointers[i];
if (BPVal->getType()->isPointerTy())
BPVal = CGF.Builder.CreateBitCast(BPVal, CGM.VoidPtrTy);
else {
assert(BPVal->getType()->isIntegerTy() &&
"If not a pointer, the value type must be an integer.");
BPVal = CGF.Builder.CreateIntToPtr(BPVal, CGM.VoidPtrTy);
}
llvm::Value *BP = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
Info.BasePointersArray, 0, i);
Address BPAddr(BP, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
CGF.Builder.CreateStore(BPVal, BPAddr);
if (Info.requiresDevicePointerInfo())
if (auto *DevVD = BasePointers[i].getDevicePtrDecl())
Info.CaptureDeviceAddrMap.insert(std::make_pair(DevVD, BPAddr));
llvm::Value *PVal = Pointers[i];
if (PVal->getType()->isPointerTy())
PVal = CGF.Builder.CreateBitCast(PVal, CGM.VoidPtrTy);
else {
assert(PVal->getType()->isIntegerTy() &&
"If not a pointer, the value type must be an integer.");
PVal = CGF.Builder.CreateIntToPtr(PVal, CGM.VoidPtrTy);
}
llvm::Value *P = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
Info.PointersArray, 0, i);
Address PAddr(P, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
CGF.Builder.CreateStore(PVal, PAddr);
if (hasRuntimeEvaluationCaptureSize) {
llvm::Value *S = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.SizeTy, Info.NumberOfPtrs),
Info.SizesArray,
/*Idx0=*/0,
/*Idx1=*/i);
Address SAddr(S, Ctx.getTypeAlignInChars(Ctx.getSizeType()));
CGF.Builder.CreateStore(
CGF.Builder.CreateIntCast(Sizes[i], CGM.SizeTy, /*isSigned=*/true),
SAddr);
}
}
}
}
/// \brief Emit the arguments to be passed to the runtime library based on the
/// arrays of pointers, sizes and map types.
static void emitOffloadingArraysArgument(
CodeGenFunction &CGF, llvm::Value *&BasePointersArrayArg,
llvm::Value *&PointersArrayArg, llvm::Value *&SizesArrayArg,
llvm::Value *&MapTypesArrayArg, CGOpenMPRuntime::TargetDataInfo &Info) {
auto &CGM = CGF.CGM;
if (Info.NumberOfPtrs) {
BasePointersArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
Info.BasePointersArray,
/*Idx0=*/0, /*Idx1=*/0);
PointersArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
Info.PointersArray,
/*Idx0=*/0,
/*Idx1=*/0);
SizesArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.SizeTy, Info.NumberOfPtrs), Info.SizesArray,
/*Idx0=*/0, /*Idx1=*/0);
MapTypesArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.Int32Ty, Info.NumberOfPtrs),
Info.MapTypesArray,
/*Idx0=*/0,
/*Idx1=*/0);
} else {
BasePointersArrayArg = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
PointersArrayArg = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
SizesArrayArg = llvm::ConstantPointerNull::get(CGM.SizeTy->getPointerTo());
MapTypesArrayArg =
llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo());
}
}
void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
const OMPExecutableDirective &D,
llvm::Value *OutlinedFn,
llvm::Value *OutlinedFnID,
const Expr *IfCond, const Expr *Device,
ArrayRef<llvm::Value *> CapturedVars) {
if (!CGF.HaveInsertPoint())
return;
assert(OutlinedFn && "Invalid outlined function!");
auto &Ctx = CGF.getContext();
// Fill up the arrays with all the captured variables.
MappableExprsHandler::MapValuesArrayTy KernelArgs;
MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
MappableExprsHandler::MapValuesArrayTy Pointers;
MappableExprsHandler::MapValuesArrayTy Sizes;
MappableExprsHandler::MapFlagsArrayTy MapTypes;
MappableExprsHandler::MapBaseValuesArrayTy CurBasePointers;
MappableExprsHandler::MapValuesArrayTy CurPointers;
MappableExprsHandler::MapValuesArrayTy CurSizes;
MappableExprsHandler::MapFlagsArrayTy CurMapTypes;
// Get mappable expression information.
MappableExprsHandler MEHandler(D, CGF);
const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
auto RI = CS.getCapturedRecordDecl()->field_begin();
auto CV = CapturedVars.begin();
for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(),
CE = CS.capture_end();
CI != CE; ++CI, ++RI, ++CV) {
StringRef Name;
QualType Ty;
CurBasePointers.clear();
CurPointers.clear();
CurSizes.clear();
CurMapTypes.clear();
// VLA sizes are passed to the outlined region by copy and do not have map
// information associated.
if (CI->capturesVariableArrayType()) {
CurBasePointers.push_back(*CV);
CurPointers.push_back(*CV);
CurSizes.push_back(CGF.getTypeSize(RI->getType()));
// Copy to the device as an argument. No need to retrieve it.
CurMapTypes.push_back(MappableExprsHandler::OMP_MAP_PRIVATE_VAL |
MappableExprsHandler::OMP_MAP_FIRST_REF);
} else {
// If we have any information in the map clause, we use it, otherwise we
// just do a default mapping.
MEHandler.generateInfoForCapture(CI, *CV, CurBasePointers, CurPointers,
CurSizes, CurMapTypes);
if (CurBasePointers.empty())
MEHandler.generateDefaultMapInfo(*CI, **RI, *CV, CurBasePointers,
CurPointers, CurSizes, CurMapTypes);
}
// We expect to have at least an element of information for this capture.
assert(!CurBasePointers.empty() && "Non-existing map pointer for capture!");
assert(CurBasePointers.size() == CurPointers.size() &&
CurBasePointers.size() == CurSizes.size() &&
CurBasePointers.size() == CurMapTypes.size() &&
"Inconsistent map information sizes!");
// The kernel args are always the first elements of the base pointers
// associated with a capture.
KernelArgs.push_back(*CurBasePointers.front());
// We need to append the results of this capture to what we already have.
BasePointers.append(CurBasePointers.begin(), CurBasePointers.end());
Pointers.append(CurPointers.begin(), CurPointers.end());
Sizes.append(CurSizes.begin(), CurSizes.end());
MapTypes.append(CurMapTypes.begin(), CurMapTypes.end());
}
// Keep track on whether the host function has to be executed.
auto OffloadErrorQType =
Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true);
auto OffloadError = CGF.MakeAddrLValue(
CGF.CreateMemTemp(OffloadErrorQType, ".run_host_version"),
OffloadErrorQType);
CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty),
OffloadError);
// Fill up the pointer arrays and transfer execution to the device.
auto &&ThenGen = [&Ctx, &BasePointers, &Pointers, &Sizes, &MapTypes, Device,
OutlinedFnID, OffloadError, OffloadErrorQType,
&D](CodeGenFunction &CGF, PrePostActionTy &) {
auto &RT = CGF.CGM.getOpenMPRuntime();
// Emit the offloading arrays.
TargetDataInfo Info;
emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
Info.PointersArray, Info.SizesArray,
Info.MapTypesArray, Info);
// On top of the arrays that were filled up, the target offloading call
// takes as arguments the device id as well as the host pointer. The host
// pointer is used by the runtime library to identify the current target
// region, so it only has to be unique and not necessarily point to
// anything. It could be the pointer to the outlined function that
// implements the target region, but we aren't using that so that the
// compiler doesn't need to keep that, and could therefore inline the host
// function if proven worthwhile during optimization.
// From this point on, we need to have an ID of the target region defined.
assert(OutlinedFnID && "Invalid outlined function ID!");
// Emit device ID if any.
llvm::Value *DeviceID;
if (Device)
DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
CGF.Int32Ty, /*isSigned=*/true);
else
DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
// Emit the number of elements in the offloading arrays.
llvm::Value *PointerNum = CGF.Builder.getInt32(BasePointers.size());
// Return value of the runtime offloading call.
llvm::Value *Return;
auto *NumTeams = emitNumTeamsClauseForTargetDirective(RT, CGF, D);
auto *ThreadLimit = emitThreadLimitClauseForTargetDirective(RT, CGF, D);
// If we have NumTeams defined this means that we have an enclosed teams
// region. Therefore we also expect to have ThreadLimit defined. These two
// values should be defined in the presence of a teams directive, regardless
// of having any clauses associated. If the user is using teams but no
// clauses, these two values will be the default that should be passed to
// the runtime library - a 32-bit integer with the value zero.
if (NumTeams) {
assert(ThreadLimit && "Thread limit expression should be available along "
"with number of teams.");
llvm::Value *OffloadingArgs[] = {
DeviceID, OutlinedFnID,
PointerNum, Info.BasePointersArray,
Info.PointersArray, Info.SizesArray,
Info.MapTypesArray, NumTeams,
ThreadLimit};
Return = CGF.EmitRuntimeCall(
RT.createRuntimeFunction(OMPRTL__tgt_target_teams), OffloadingArgs);
} else {
llvm::Value *OffloadingArgs[] = {
DeviceID, OutlinedFnID,
PointerNum, Info.BasePointersArray,
Info.PointersArray, Info.SizesArray,
Info.MapTypesArray};
Return = CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target),
OffloadingArgs);
}
CGF.EmitStoreOfScalar(Return, OffloadError);
};
// Notify that the host version must be executed.
auto &&ElseGen = [OffloadError](CodeGenFunction &CGF, PrePostActionTy &) {
CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/-1u),
OffloadError);
};
// If we have a target function ID it means that we need to support
// offloading, otherwise, just execute on the host. We need to execute on host
// regardless of the conditional in the if clause if, e.g., the user do not
// specify target triples.
if (OutlinedFnID) {
if (IfCond)
emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
else {
RegionCodeGenTy ThenRCG(ThenGen);
ThenRCG(CGF);
}
} else {
RegionCodeGenTy ElseRCG(ElseGen);
ElseRCG(CGF);
}
// Check the error code and execute the host version if required.
auto OffloadFailedBlock = CGF.createBasicBlock("omp_offload.failed");
auto OffloadContBlock = CGF.createBasicBlock("omp_offload.cont");
auto OffloadErrorVal = CGF.EmitLoadOfScalar(OffloadError, SourceLocation());
auto Failed = CGF.Builder.CreateIsNotNull(OffloadErrorVal);
CGF.Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
CGF.EmitBlock(OffloadFailedBlock);
CGF.Builder.CreateCall(OutlinedFn, KernelArgs);
CGF.EmitBranch(OffloadContBlock);
CGF.EmitBlock(OffloadContBlock, /*IsFinished=*/true);
}
void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S,
StringRef ParentName) {
if (!S)
return;
// If we find a OMP target directive, codegen the outline function and
// register the result.
// FIXME: Add other directives with target when they become supported.
bool isTargetDirective = isa<OMPTargetDirective>(S);
if (isTargetDirective) {
auto *E = cast<OMPExecutableDirective>(S);
unsigned DeviceID;
unsigned FileID;
unsigned Line;
getTargetEntryUniqueInfo(CGM.getContext(), E->getLocStart(), DeviceID,
FileID, Line);
// Is this a target region that should not be emitted as an entry point? If
// so just signal we are done with this target region.
if (!OffloadEntriesInfoManager.hasTargetRegionEntryInfo(DeviceID, FileID,
ParentName, Line))
return;
llvm::Function *Fn;
llvm::Constant *Addr;
std::tie(Fn, Addr) =
CodeGenFunction::EmitOMPTargetDirectiveOutlinedFunction(
CGM, cast<OMPTargetDirective>(*E), ParentName,
/*isOffloadEntry=*/true);
assert(Fn && Addr && "Target region emission failed.");
return;
}
if (const OMPExecutableDirective *E = dyn_cast<OMPExecutableDirective>(S)) {
if (!E->hasAssociatedStmt())
return;
scanForTargetRegionsFunctions(
cast<CapturedStmt>(E->getAssociatedStmt())->getCapturedStmt(),
ParentName);
return;
}
// If this is a lambda function, look into its body.
if (auto *L = dyn_cast<LambdaExpr>(S))
S = L->getBody();
// Keep looking for target regions recursively.
for (auto *II : S->children())
scanForTargetRegionsFunctions(II, ParentName);
}
bool CGOpenMPRuntime::emitTargetFunctions(GlobalDecl GD) {
auto &FD = *cast<FunctionDecl>(GD.getDecl());
// If emitting code for the host, we do not process FD here. Instead we do
// the normal code generation.
if (!CGM.getLangOpts().OpenMPIsDevice)
return false;
// Try to detect target regions in the function.
scanForTargetRegionsFunctions(FD.getBody(), CGM.getMangledName(GD));
// We should not emit any function other that the ones created during the
// scanning. Therefore, we signal that this function is completely dealt
// with.
return true;
}
bool CGOpenMPRuntime::emitTargetGlobalVariable(GlobalDecl GD) {
if (!CGM.getLangOpts().OpenMPIsDevice)
return false;
// Check if there are Ctors/Dtors in this declaration and look for target
// regions in it. We use the complete variant to produce the kernel name
// mangling.
QualType RDTy = cast<VarDecl>(GD.getDecl())->getType();
if (auto *RD = RDTy->getBaseElementTypeUnsafe()->getAsCXXRecordDecl()) {
for (auto *Ctor : RD->ctors()) {
StringRef ParentName =
CGM.getMangledName(GlobalDecl(Ctor, Ctor_Complete));
scanForTargetRegionsFunctions(Ctor->getBody(), ParentName);
}
auto *Dtor = RD->getDestructor();
if (Dtor) {
StringRef ParentName =
CGM.getMangledName(GlobalDecl(Dtor, Dtor_Complete));
scanForTargetRegionsFunctions(Dtor->getBody(), ParentName);
}
}
// If we are in target mode we do not emit any global (declare target is not
// implemented yet). Therefore we signal that GD was processed in this case.
return true;
}
bool CGOpenMPRuntime::emitTargetGlobal(GlobalDecl GD) {
auto *VD = GD.getDecl();
if (isa<FunctionDecl>(VD))
return emitTargetFunctions(GD);
return emitTargetGlobalVariable(GD);
}
llvm::Function *CGOpenMPRuntime::emitRegistrationFunction() {
// If we have offloading in the current module, we need to emit the entries
// now and register the offloading descriptor.
createOffloadEntriesAndInfoMetadata();
// Create and register the offloading binary descriptors. This is the main
// entity that captures all the information about offloading in the current
// compilation unit.
return createOffloadingBinaryDescriptorRegistration();
}
void CGOpenMPRuntime::emitTeamsCall(CodeGenFunction &CGF,
const OMPExecutableDirective &D,
SourceLocation Loc,
llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars) {
if (!CGF.HaveInsertPoint())
return;
auto *RTLoc = emitUpdateLocation(CGF, Loc);
CodeGenFunction::RunCleanupsScope Scope(CGF);
// Build call __kmpc_fork_teams(loc, n, microtask, var1, .., varn);
llvm::Value *Args[] = {
RTLoc,
CGF.Builder.getInt32(CapturedVars.size()), // Number of captured vars
CGF.Builder.CreateBitCast(OutlinedFn, getKmpc_MicroPointerTy())};
llvm::SmallVector<llvm::Value *, 16> RealArgs;
RealArgs.append(std::begin(Args), std::end(Args));
RealArgs.append(CapturedVars.begin(), CapturedVars.end());
auto RTLFn = createRuntimeFunction(OMPRTL__kmpc_fork_teams);
CGF.EmitRuntimeCall(RTLFn, RealArgs);
}
void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF,
const Expr *NumTeams,
const Expr *ThreadLimit,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
auto *RTLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *NumTeamsVal =
(NumTeams)
? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(NumTeams),
CGF.CGM.Int32Ty, /* isSigned = */ true)
: CGF.Builder.getInt32(0);
llvm::Value *ThreadLimitVal =
(ThreadLimit)
? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(ThreadLimit),
CGF.CGM.Int32Ty, /* isSigned = */ true)
: CGF.Builder.getInt32(0);
// Build call __kmpc_push_num_teamss(&loc, global_tid, num_teams, thread_limit)
llvm::Value *PushNumTeamsArgs[] = {RTLoc, getThreadID(CGF, Loc), NumTeamsVal,
ThreadLimitVal};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_teams),
PushNumTeamsArgs);
}
void CGOpenMPRuntime::emitTargetDataCalls(
CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *IfCond,
const Expr *Device, const RegionCodeGenTy &CodeGen, TargetDataInfo &Info) {
if (!CGF.HaveInsertPoint())
return;
// Action used to replace the default codegen action and turn privatization
// off.
PrePostActionTy NoPrivAction;
// Generate the code for the opening of the data environment. Capture all the
// arguments of the runtime call by reference because they are used in the
// closing of the region.
auto &&BeginThenGen = [&D, &CGF, Device, &Info, &CodeGen, &NoPrivAction](
CodeGenFunction &CGF, PrePostActionTy &) {
// Fill up the arrays with all the mapped variables.
MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
MappableExprsHandler::MapValuesArrayTy Pointers;
MappableExprsHandler::MapValuesArrayTy Sizes;
MappableExprsHandler::MapFlagsArrayTy MapTypes;
// Get map clause information.
MappableExprsHandler MCHandler(D, CGF);
MCHandler.generateAllInfo(BasePointers, Pointers, Sizes, MapTypes);
// Fill up the arrays and create the arguments.
emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
llvm::Value *BasePointersArrayArg = nullptr;
llvm::Value *PointersArrayArg = nullptr;
llvm::Value *SizesArrayArg = nullptr;
llvm::Value *MapTypesArrayArg = nullptr;
emitOffloadingArraysArgument(CGF, BasePointersArrayArg, PointersArrayArg,
SizesArrayArg, MapTypesArrayArg, Info);
// Emit device ID if any.
llvm::Value *DeviceID = nullptr;
if (Device)
DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
CGF.Int32Ty, /*isSigned=*/true);
else
DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
// Emit the number of elements in the offloading arrays.
auto *PointerNum = CGF.Builder.getInt32(Info.NumberOfPtrs);
llvm::Value *OffloadingArgs[] = {
DeviceID, PointerNum, BasePointersArrayArg,
PointersArrayArg, SizesArrayArg, MapTypesArrayArg};
auto &RT = CGF.CGM.getOpenMPRuntime();
CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target_data_begin),
OffloadingArgs);
// If device pointer privatization is required, emit the body of the region
// here. It will have to be duplicated: with and without privatization.
if (!Info.CaptureDeviceAddrMap.empty())
CodeGen(CGF);
};
// Generate code for the closing of the data region.
auto &&EndThenGen = [&CGF, Device, &Info](CodeGenFunction &CGF,
PrePostActionTy &) {
assert(Info.isValid() && "Invalid data environment closing arguments.");
llvm::Value *BasePointersArrayArg = nullptr;
llvm::Value *PointersArrayArg = nullptr;
llvm::Value *SizesArrayArg = nullptr;
llvm::Value *MapTypesArrayArg = nullptr;
emitOffloadingArraysArgument(CGF, BasePointersArrayArg, PointersArrayArg,
SizesArrayArg, MapTypesArrayArg, Info);
// Emit device ID if any.
llvm::Value *DeviceID = nullptr;
if (Device)
DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
CGF.Int32Ty, /*isSigned=*/true);
else
DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
// Emit the number of elements in the offloading arrays.
auto *PointerNum = CGF.Builder.getInt32(Info.NumberOfPtrs);
llvm::Value *OffloadingArgs[] = {
DeviceID, PointerNum, BasePointersArrayArg,
PointersArrayArg, SizesArrayArg, MapTypesArrayArg};
auto &RT = CGF.CGM.getOpenMPRuntime();
CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target_data_end),
OffloadingArgs);
};
// If we need device pointer privatization, we need to emit the body of the
// region with no privatization in the 'else' branch of the conditional.
// Otherwise, we don't have to do anything.
auto &&BeginElseGen = [&Info, &CodeGen, &NoPrivAction](CodeGenFunction &CGF,
PrePostActionTy &) {
if (!Info.CaptureDeviceAddrMap.empty()) {
CodeGen.setAction(NoPrivAction);
CodeGen(CGF);
}
};
// We don't have to do anything to close the region if the if clause evaluates
// to false.
auto &&EndElseGen = [](CodeGenFunction &CGF, PrePostActionTy &) {};
if (IfCond) {
emitOMPIfClause(CGF, IfCond, BeginThenGen, BeginElseGen);
} else {
RegionCodeGenTy RCG(BeginThenGen);
RCG(CGF);
}
// If we don't require privatization of device pointers, we emit the body in
// between the runtime calls. This avoids duplicating the body code.
if (Info.CaptureDeviceAddrMap.empty()) {
CodeGen.setAction(NoPrivAction);
CodeGen(CGF);
}
if (IfCond) {
emitOMPIfClause(CGF, IfCond, EndThenGen, EndElseGen);
} else {
RegionCodeGenTy RCG(EndThenGen);
RCG(CGF);
}
}
void CGOpenMPRuntime::emitTargetDataStandAloneCall(
CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *IfCond,
const Expr *Device) {
if (!CGF.HaveInsertPoint())
return;
assert((isa<OMPTargetEnterDataDirective>(D) ||
isa<OMPTargetExitDataDirective>(D) ||
isa<OMPTargetUpdateDirective>(D)) &&
"Expecting either target enter, exit data, or update directives.");
// Generate the code for the opening of the data environment.
auto &&ThenGen = [&D, &CGF, Device](CodeGenFunction &CGF, PrePostActionTy &) {
// Fill up the arrays with all the mapped variables.
MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
MappableExprsHandler::MapValuesArrayTy Pointers;
MappableExprsHandler::MapValuesArrayTy Sizes;
MappableExprsHandler::MapFlagsArrayTy MapTypes;
// Get map clause information.
MappableExprsHandler MEHandler(D, CGF);
MEHandler.generateAllInfo(BasePointers, Pointers, Sizes, MapTypes);
// Fill up the arrays and create the arguments.
TargetDataInfo Info;
emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
Info.PointersArray, Info.SizesArray,
Info.MapTypesArray, Info);
// Emit device ID if any.
llvm::Value *DeviceID = nullptr;
if (Device)
DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
CGF.Int32Ty, /*isSigned=*/true);
else
DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
// Emit the number of elements in the offloading arrays.
auto *PointerNum = CGF.Builder.getInt32(BasePointers.size());
llvm::Value *OffloadingArgs[] = {
DeviceID, PointerNum, Info.BasePointersArray,
Info.PointersArray, Info.SizesArray, Info.MapTypesArray};
auto &RT = CGF.CGM.getOpenMPRuntime();
// Select the right runtime function call for each expected standalone
// directive.
OpenMPRTLFunction RTLFn;
switch (D.getDirectiveKind()) {
default:
llvm_unreachable("Unexpected standalone target data directive.");
break;
case OMPD_target_enter_data:
RTLFn = OMPRTL__tgt_target_data_begin;
break;
case OMPD_target_exit_data:
RTLFn = OMPRTL__tgt_target_data_end;
break;
case OMPD_target_update:
RTLFn = OMPRTL__tgt_target_data_update;
break;
}
CGF.EmitRuntimeCall(RT.createRuntimeFunction(RTLFn), OffloadingArgs);
};
// In the event we get an if clause, we don't have to take any action on the
// else side.
auto &&ElseGen = [](CodeGenFunction &CGF, PrePostActionTy &) {};
if (IfCond) {
emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
} else {
RegionCodeGenTy ThenGenRCG(ThenGen);
ThenGenRCG(CGF);
}
}
namespace {
/// Kind of parameter in a function with 'declare simd' directive.
enum ParamKindTy { LinearWithVarStride, Linear, Uniform, Vector };
/// Attribute set of the parameter.
struct ParamAttrTy {
ParamKindTy Kind = Vector;
llvm::APSInt StrideOrArg;
llvm::APSInt Alignment;
};
} // namespace
static unsigned evaluateCDTSize(const FunctionDecl *FD,
ArrayRef<ParamAttrTy> ParamAttrs) {
// Every vector variant of a SIMD-enabled function has a vector length (VLEN).
// If OpenMP clause "simdlen" is used, the VLEN is the value of the argument
// of that clause. The VLEN value must be power of 2.
// In other case the notion of the function`s "characteristic data type" (CDT)
// is used to compute the vector length.
// CDT is defined in the following order:
// a) For non-void function, the CDT is the return type.
// b) If the function has any non-uniform, non-linear parameters, then the
// CDT is the type of the first such parameter.
// c) If the CDT determined by a) or b) above is struct, union, or class
// type which is pass-by-value (except for the type that maps to the
// built-in complex data type), the characteristic data type is int.
// d) If none of the above three cases is applicable, the CDT is int.
// The VLEN is then determined based on the CDT and the size of vector
// register of that ISA for which current vector version is generated. The
// VLEN is computed using the formula below:
// VLEN = sizeof(vector_register) / sizeof(CDT),
// where vector register size specified in section 3.2.1 Registers and the
// Stack Frame of original AMD64 ABI document.
QualType RetType = FD->getReturnType();
if (RetType.isNull())
return 0;
ASTContext &C = FD->getASTContext();
QualType CDT;
if (!RetType.isNull() && !RetType->isVoidType())
CDT = RetType;
else {
unsigned Offset = 0;
if (auto *MD = dyn_cast<CXXMethodDecl>(FD)) {
if (ParamAttrs[Offset].Kind == Vector)
CDT = C.getPointerType(C.getRecordType(MD->getParent()));
++Offset;
}
if (CDT.isNull()) {
for (unsigned I = 0, E = FD->getNumParams(); I < E; ++I) {
if (ParamAttrs[I + Offset].Kind == Vector) {
CDT = FD->getParamDecl(I)->getType();
break;
}
}
}
}
if (CDT.isNull())
CDT = C.IntTy;
CDT = CDT->getCanonicalTypeUnqualified();
if (CDT->isRecordType() || CDT->isUnionType())
CDT = C.IntTy;
return C.getTypeSize(CDT);
}
static void
emitX86DeclareSimdFunction(const FunctionDecl *FD, llvm::Function *Fn,
const llvm::APSInt &VLENVal,
ArrayRef<ParamAttrTy> ParamAttrs,
OMPDeclareSimdDeclAttr::BranchStateTy State) {
struct ISADataTy {
char ISA;
unsigned VecRegSize;
};
ISADataTy ISAData[] = {
{
'b', 128
}, // SSE
{
'c', 256
}, // AVX
{
'd', 256
}, // AVX2
{
'e', 512
}, // AVX512
};
llvm::SmallVector<char, 2> Masked;
switch (State) {
case OMPDeclareSimdDeclAttr::BS_Undefined:
Masked.push_back('N');
Masked.push_back('M');
break;
case OMPDeclareSimdDeclAttr::BS_Notinbranch:
Masked.push_back('N');
break;
case OMPDeclareSimdDeclAttr::BS_Inbranch:
Masked.push_back('M');
break;
}
for (auto Mask : Masked) {
for (auto &Data : ISAData) {
SmallString<256> Buffer;
llvm::raw_svector_ostream Out(Buffer);
Out << "_ZGV" << Data.ISA << Mask;
if (!VLENVal) {
Out << llvm::APSInt::getUnsigned(Data.VecRegSize /
evaluateCDTSize(FD, ParamAttrs));
} else
Out << VLENVal;
for (auto &ParamAttr : ParamAttrs) {
switch (ParamAttr.Kind){
case LinearWithVarStride:
Out << 's' << ParamAttr.StrideOrArg;
break;
case Linear:
Out << 'l';
if (!!ParamAttr.StrideOrArg)
Out << ParamAttr.StrideOrArg;
break;
case Uniform:
Out << 'u';
break;
case Vector:
Out << 'v';
break;
}
if (!!ParamAttr.Alignment)
Out << 'a' << ParamAttr.Alignment;
}
Out << '_' << Fn->getName();
Fn->addFnAttr(Out.str());
}
}
}
void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
llvm::Function *Fn) {
ASTContext &C = CGM.getContext();
FD = FD->getCanonicalDecl();
// Map params to their positions in function decl.
llvm::DenseMap<const Decl *, unsigned> ParamPositions;
if (isa<CXXMethodDecl>(FD))
ParamPositions.insert({FD, 0});
unsigned ParamPos = ParamPositions.size();
for (auto *P : FD->parameters()) {
ParamPositions.insert({P->getCanonicalDecl(), ParamPos});
++ParamPos;
}
for (auto *Attr : FD->specific_attrs<OMPDeclareSimdDeclAttr>()) {
llvm::SmallVector<ParamAttrTy, 8> ParamAttrs(ParamPositions.size());
// Mark uniform parameters.
for (auto *E : Attr->uniforms()) {
E = E->IgnoreParenImpCasts();
unsigned Pos;
if (isa<CXXThisExpr>(E))
Pos = ParamPositions[FD];
else {
auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
->getCanonicalDecl();
Pos = ParamPositions[PVD];
}
ParamAttrs[Pos].Kind = Uniform;
}
// Get alignment info.
auto NI = Attr->alignments_begin();
for (auto *E : Attr->aligneds()) {
E = E->IgnoreParenImpCasts();
unsigned Pos;
QualType ParmTy;
if (isa<CXXThisExpr>(E)) {
Pos = ParamPositions[FD];
ParmTy = E->getType();
} else {
auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
->getCanonicalDecl();
Pos = ParamPositions[PVD];
ParmTy = PVD->getType();
}
ParamAttrs[Pos].Alignment =
(*NI) ? (*NI)->EvaluateKnownConstInt(C)
: llvm::APSInt::getUnsigned(
C.toCharUnitsFromBits(C.getOpenMPDefaultSimdAlign(ParmTy))
.getQuantity());
++NI;
}
// Mark linear parameters.
auto SI = Attr->steps_begin();
auto MI = Attr->modifiers_begin();
for (auto *E : Attr->linears()) {
E = E->IgnoreParenImpCasts();
unsigned Pos;
if (isa<CXXThisExpr>(E))
Pos = ParamPositions[FD];
else {
auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
->getCanonicalDecl();
Pos = ParamPositions[PVD];
}
auto &ParamAttr = ParamAttrs[Pos];
ParamAttr.Kind = Linear;
if (*SI) {
if (!(*SI)->EvaluateAsInt(ParamAttr.StrideOrArg, C,
Expr::SE_AllowSideEffects)) {
if (auto *DRE = cast<DeclRefExpr>((*SI)->IgnoreParenImpCasts())) {
if (auto *StridePVD = cast<ParmVarDecl>(DRE->getDecl())) {
ParamAttr.Kind = LinearWithVarStride;
ParamAttr.StrideOrArg = llvm::APSInt::getUnsigned(
ParamPositions[StridePVD->getCanonicalDecl()]);
}
}
}
}
++SI;
++MI;
}
llvm::APSInt VLENVal;
if (const Expr *VLEN = Attr->getSimdlen())
VLENVal = VLEN->EvaluateKnownConstInt(C);
OMPDeclareSimdDeclAttr::BranchStateTy State = Attr->getBranchState();
if (CGM.getTriple().getArch() == llvm::Triple::x86 ||
CGM.getTriple().getArch() == llvm::Triple::x86_64)
emitX86DeclareSimdFunction(FD, Fn, VLENVal, ParamAttrs, State);
}
}
namespace {
/// Cleanup action for doacross support.
class DoacrossCleanupTy final : public EHScopeStack::Cleanup {
public:
static const int DoacrossFinArgs = 2;
private:
llvm::Value *RTLFn;
llvm::Value *Args[DoacrossFinArgs];
public:
DoacrossCleanupTy(llvm::Value *RTLFn, ArrayRef<llvm::Value *> CallArgs)
: RTLFn(RTLFn) {
assert(CallArgs.size() == DoacrossFinArgs);
std::copy(CallArgs.begin(), CallArgs.end(), std::begin(Args));
}
void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
if (!CGF.HaveInsertPoint())
return;
CGF.EmitRuntimeCall(RTLFn, Args);
}
};
} // namespace
void CGOpenMPRuntime::emitDoacrossInit(CodeGenFunction &CGF,
const OMPLoopDirective &D) {
if (!CGF.HaveInsertPoint())
return;
ASTContext &C = CGM.getContext();
QualType Int64Ty = C.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/true);
RecordDecl *RD;
if (KmpDimTy.isNull()) {
// Build struct kmp_dim { // loop bounds info casted to kmp_int64
// kmp_int64 lo; // lower
// kmp_int64 up; // upper
// kmp_int64 st; // stride
// };
RD = C.buildImplicitRecord("kmp_dim");
RD->startDefinition();
addFieldToRecordDecl(C, RD, Int64Ty);
addFieldToRecordDecl(C, RD, Int64Ty);
addFieldToRecordDecl(C, RD, Int64Ty);
RD->completeDefinition();
KmpDimTy = C.getRecordType(RD);
} else
RD = cast<RecordDecl>(KmpDimTy->getAsTagDecl());
Address DimsAddr = CGF.CreateMemTemp(KmpDimTy, "dims");
CGF.EmitNullInitialization(DimsAddr, KmpDimTy);
enum { LowerFD = 0, UpperFD, StrideFD };
// Fill dims with data.
LValue DimsLVal = CGF.MakeAddrLValue(DimsAddr, KmpDimTy);
// dims.upper = num_iterations;
LValue UpperLVal =
CGF.EmitLValueForField(DimsLVal, *std::next(RD->field_begin(), UpperFD));
llvm::Value *NumIterVal = CGF.EmitScalarConversion(
CGF.EmitScalarExpr(D.getNumIterations()), D.getNumIterations()->getType(),
Int64Ty, D.getNumIterations()->getExprLoc());
CGF.EmitStoreOfScalar(NumIterVal, UpperLVal);
// dims.stride = 1;
LValue StrideLVal =
CGF.EmitLValueForField(DimsLVal, *std::next(RD->field_begin(), StrideFD));
CGF.EmitStoreOfScalar(llvm::ConstantInt::getSigned(CGM.Int64Ty, /*V=*/1),
StrideLVal);
// Build call void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid,
// kmp_int32 num_dims, struct kmp_dim * dims);
llvm::Value *Args[] = {emitUpdateLocation(CGF, D.getLocStart()),
getThreadID(CGF, D.getLocStart()),
llvm::ConstantInt::getSigned(CGM.Int32Ty, 1),
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
DimsAddr.getPointer(), CGM.VoidPtrTy)};
llvm::Value *RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_init);
CGF.EmitRuntimeCall(RTLFn, Args);
llvm::Value *FiniArgs[DoacrossCleanupTy::DoacrossFinArgs] = {
emitUpdateLocation(CGF, D.getLocEnd()), getThreadID(CGF, D.getLocEnd())};
llvm::Value *FiniRTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_fini);
CGF.EHStack.pushCleanup<DoacrossCleanupTy>(NormalAndEHCleanup, FiniRTLFn,
llvm::makeArrayRef(FiniArgs));
}
void CGOpenMPRuntime::emitDoacrossOrdered(CodeGenFunction &CGF,
const OMPDependClause *C) {
QualType Int64Ty =
CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
const Expr *CounterVal = C->getCounterValue();
assert(CounterVal);
llvm::Value *CntVal = CGF.EmitScalarConversion(CGF.EmitScalarExpr(CounterVal),
CounterVal->getType(), Int64Ty,
CounterVal->getExprLoc());
Address CntAddr = CGF.CreateMemTemp(Int64Ty, ".cnt.addr");
CGF.EmitStoreOfScalar(CntVal, CntAddr, /*Volatile=*/false, Int64Ty);
llvm::Value *Args[] = {emitUpdateLocation(CGF, C->getLocStart()),
getThreadID(CGF, C->getLocStart()),
CntAddr.getPointer()};
llvm::Value *RTLFn;
if (C->getDependencyKind() == OMPC_DEPEND_source)
RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_post);
else {
assert(C->getDependencyKind() == OMPC_DEPEND_sink);
RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_wait);
}
CGF.EmitRuntimeCall(RTLFn, Args);
}
Index: projects/clang400-import/contrib/llvm/tools/clang/lib/Frontend/InitPreprocessor.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/tools/clang/lib/Frontend/InitPreprocessor.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/clang/lib/Frontend/InitPreprocessor.cpp (revision 314269)
@@ -1,1100 +1,1102 @@
//===--- InitPreprocessor.cpp - PP initialization code. ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the clang::InitializePreprocessor function.
//
//===----------------------------------------------------------------------===//
#include "clang/Basic/FileManager.h"
#include "clang/Basic/MacroBuilder.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/Version.h"
#include "clang/Frontend/FrontendDiagnostic.h"
#include "clang/Frontend/FrontendOptions.h"
#include "clang/Frontend/Utils.h"
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/PTHManager.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
#include "clang/Serialization/ASTReader.h"
#include "llvm/ADT/APFloat.h"
using namespace clang;
static bool MacroBodyEndsInBackslash(StringRef MacroBody) {
while (!MacroBody.empty() && isWhitespace(MacroBody.back()))
MacroBody = MacroBody.drop_back();
return !MacroBody.empty() && MacroBody.back() == '\\';
}
// Append a #define line to Buf for Macro. Macro should be of the form XXX,
// in which case we emit "#define XXX 1" or "XXX=Y z W" in which case we emit
// "#define XXX Y z W". To get a #define with no value, use "XXX=".
static void DefineBuiltinMacro(MacroBuilder &Builder, StringRef Macro,
DiagnosticsEngine &Diags) {
std::pair<StringRef, StringRef> MacroPair = Macro.split('=');
StringRef MacroName = MacroPair.first;
StringRef MacroBody = MacroPair.second;
if (MacroName.size() != Macro.size()) {
// Per GCC -D semantics, the macro ends at \n if it exists.
StringRef::size_type End = MacroBody.find_first_of("\n\r");
if (End != StringRef::npos)
Diags.Report(diag::warn_fe_macro_contains_embedded_newline)
<< MacroName;
MacroBody = MacroBody.substr(0, End);
// We handle macro bodies which end in a backslash by appending an extra
// backslash+newline. This makes sure we don't accidentally treat the
// backslash as a line continuation marker.
if (MacroBodyEndsInBackslash(MacroBody))
Builder.defineMacro(MacroName, Twine(MacroBody) + "\\\n");
else
Builder.defineMacro(MacroName, MacroBody);
} else {
// Push "macroname 1".
Builder.defineMacro(Macro);
}
}
/// AddImplicitInclude - Add an implicit \#include of the specified file to the
/// predefines buffer.
/// As these includes are generated by -include arguments the header search
/// logic is going to search relatively to the current working directory.
static void AddImplicitInclude(MacroBuilder &Builder, StringRef File) {
Builder.append(Twine("#include \"") + File + "\"");
}
static void AddImplicitIncludeMacros(MacroBuilder &Builder, StringRef File) {
Builder.append(Twine("#__include_macros \"") + File + "\"");
// Marker token to stop the __include_macros fetch loop.
Builder.append("##"); // ##?
}
/// AddImplicitIncludePTH - Add an implicit \#include using the original file
/// used to generate a PTH cache.
static void AddImplicitIncludePTH(MacroBuilder &Builder, Preprocessor &PP,
StringRef ImplicitIncludePTH) {
PTHManager *P = PP.getPTHManager();
// Null check 'P' in the corner case where it couldn't be created.
const char *OriginalFile = P ? P->getOriginalSourceFile() : nullptr;
if (!OriginalFile) {
PP.getDiagnostics().Report(diag::err_fe_pth_file_has_no_source_header)
<< ImplicitIncludePTH;
return;
}
AddImplicitInclude(Builder, OriginalFile);
}
/// \brief Add an implicit \#include using the original file used to generate
/// a PCH file.
static void AddImplicitIncludePCH(MacroBuilder &Builder, Preprocessor &PP,
const PCHContainerReader &PCHContainerRdr,
StringRef ImplicitIncludePCH) {
std::string OriginalFile =
ASTReader::getOriginalSourceFile(ImplicitIncludePCH, PP.getFileManager(),
PCHContainerRdr, PP.getDiagnostics());
if (OriginalFile.empty())
return;
AddImplicitInclude(Builder, OriginalFile);
}
/// PickFP - This is used to pick a value based on the FP semantics of the
/// specified FP model.
template <typename T>
static T PickFP(const llvm::fltSemantics *Sem, T IEEESingleVal,
T IEEEDoubleVal, T X87DoubleExtendedVal, T PPCDoubleDoubleVal,
T IEEEQuadVal) {
if (Sem == (const llvm::fltSemantics*)&llvm::APFloat::IEEEsingle())
return IEEESingleVal;
if (Sem == (const llvm::fltSemantics*)&llvm::APFloat::IEEEdouble())
return IEEEDoubleVal;
if (Sem == (const llvm::fltSemantics*)&llvm::APFloat::x87DoubleExtended())
return X87DoubleExtendedVal;
if (Sem == (const llvm::fltSemantics*)&llvm::APFloat::PPCDoubleDouble())
return PPCDoubleDoubleVal;
assert(Sem == (const llvm::fltSemantics*)&llvm::APFloat::IEEEquad());
return IEEEQuadVal;
}
static void DefineFloatMacros(MacroBuilder &Builder, StringRef Prefix,
const llvm::fltSemantics *Sem, StringRef Ext) {
const char *DenormMin, *Epsilon, *Max, *Min;
DenormMin = PickFP(Sem, "1.40129846e-45", "4.9406564584124654e-324",
"3.64519953188247460253e-4951",
"4.94065645841246544176568792868221e-324",
"6.47517511943802511092443895822764655e-4966");
int Digits = PickFP(Sem, 6, 15, 18, 31, 33);
int DecimalDigits = PickFP(Sem, 9, 17, 21, 33, 36);
Epsilon = PickFP(Sem, "1.19209290e-7", "2.2204460492503131e-16",
"1.08420217248550443401e-19",
"4.94065645841246544176568792868221e-324",
"1.92592994438723585305597794258492732e-34");
int MantissaDigits = PickFP(Sem, 24, 53, 64, 106, 113);
int Min10Exp = PickFP(Sem, -37, -307, -4931, -291, -4931);
int Max10Exp = PickFP(Sem, 38, 308, 4932, 308, 4932);
int MinExp = PickFP(Sem, -125, -1021, -16381, -968, -16381);
int MaxExp = PickFP(Sem, 128, 1024, 16384, 1024, 16384);
Min = PickFP(Sem, "1.17549435e-38", "2.2250738585072014e-308",
"3.36210314311209350626e-4932",
"2.00416836000897277799610805135016e-292",
"3.36210314311209350626267781732175260e-4932");
Max = PickFP(Sem, "3.40282347e+38", "1.7976931348623157e+308",
"1.18973149535723176502e+4932",
"1.79769313486231580793728971405301e+308",
"1.18973149535723176508575932662800702e+4932");
SmallString<32> DefPrefix;
DefPrefix = "__";
DefPrefix += Prefix;
DefPrefix += "_";
Builder.defineMacro(DefPrefix + "DENORM_MIN__", Twine(DenormMin)+Ext);
Builder.defineMacro(DefPrefix + "HAS_DENORM__");
Builder.defineMacro(DefPrefix + "DIG__", Twine(Digits));
Builder.defineMacro(DefPrefix + "DECIMAL_DIG__", Twine(DecimalDigits));
Builder.defineMacro(DefPrefix + "EPSILON__", Twine(Epsilon)+Ext);
Builder.defineMacro(DefPrefix + "HAS_INFINITY__");
Builder.defineMacro(DefPrefix + "HAS_QUIET_NAN__");
Builder.defineMacro(DefPrefix + "MANT_DIG__", Twine(MantissaDigits));
Builder.defineMacro(DefPrefix + "MAX_10_EXP__", Twine(Max10Exp));
Builder.defineMacro(DefPrefix + "MAX_EXP__", Twine(MaxExp));
Builder.defineMacro(DefPrefix + "MAX__", Twine(Max)+Ext);
Builder.defineMacro(DefPrefix + "MIN_10_EXP__","("+Twine(Min10Exp)+")");
Builder.defineMacro(DefPrefix + "MIN_EXP__", "("+Twine(MinExp)+")");
Builder.defineMacro(DefPrefix + "MIN__", Twine(Min)+Ext);
}
/// DefineTypeSize - Emit a macro to the predefines buffer that declares a macro
/// named MacroName with the max value for a type with width 'TypeWidth' a
/// signedness of 'isSigned' and with a value suffix of 'ValSuffix' (e.g. LL).
static void DefineTypeSize(const Twine &MacroName, unsigned TypeWidth,
StringRef ValSuffix, bool isSigned,
MacroBuilder &Builder) {
llvm::APInt MaxVal = isSigned ? llvm::APInt::getSignedMaxValue(TypeWidth)
: llvm::APInt::getMaxValue(TypeWidth);
Builder.defineMacro(MacroName, MaxVal.toString(10, isSigned) + ValSuffix);
}
/// DefineTypeSize - An overloaded helper that uses TargetInfo to determine
/// the width, suffix, and signedness of the given type
static void DefineTypeSize(const Twine &MacroName, TargetInfo::IntType Ty,
const TargetInfo &TI, MacroBuilder &Builder) {
DefineTypeSize(MacroName, TI.getTypeWidth(Ty), TI.getTypeConstantSuffix(Ty),
TI.isTypeSigned(Ty), Builder);
}
static void DefineFmt(const Twine &Prefix, TargetInfo::IntType Ty,
const TargetInfo &TI, MacroBuilder &Builder) {
bool IsSigned = TI.isTypeSigned(Ty);
StringRef FmtModifier = TI.getTypeFormatModifier(Ty);
for (const char *Fmt = IsSigned ? "di" : "ouxX"; *Fmt; ++Fmt) {
Builder.defineMacro(Prefix + "_FMT" + Twine(*Fmt) + "__",
Twine("\"") + FmtModifier + Twine(*Fmt) + "\"");
}
}
static void DefineType(const Twine &MacroName, TargetInfo::IntType Ty,
MacroBuilder &Builder) {
Builder.defineMacro(MacroName, TargetInfo::getTypeName(Ty));
}
static void DefineTypeWidth(StringRef MacroName, TargetInfo::IntType Ty,
const TargetInfo &TI, MacroBuilder &Builder) {
Builder.defineMacro(MacroName, Twine(TI.getTypeWidth(Ty)));
}
static void DefineTypeSizeof(StringRef MacroName, unsigned BitWidth,
const TargetInfo &TI, MacroBuilder &Builder) {
Builder.defineMacro(MacroName,
Twine(BitWidth / TI.getCharWidth()));
}
static void DefineExactWidthIntType(TargetInfo::IntType Ty,
const TargetInfo &TI,
MacroBuilder &Builder) {
int TypeWidth = TI.getTypeWidth(Ty);
bool IsSigned = TI.isTypeSigned(Ty);
// Use the target specified int64 type, when appropriate, so that [u]int64_t
// ends up being defined in terms of the correct type.
if (TypeWidth == 64)
Ty = IsSigned ? TI.getInt64Type() : TI.getUInt64Type();
const char *Prefix = IsSigned ? "__INT" : "__UINT";
DefineType(Prefix + Twine(TypeWidth) + "_TYPE__", Ty, Builder);
DefineFmt(Prefix + Twine(TypeWidth), Ty, TI, Builder);
StringRef ConstSuffix(TI.getTypeConstantSuffix(Ty));
Builder.defineMacro(Prefix + Twine(TypeWidth) + "_C_SUFFIX__", ConstSuffix);
}
static void DefineExactWidthIntTypeSize(TargetInfo::IntType Ty,
const TargetInfo &TI,
MacroBuilder &Builder) {
int TypeWidth = TI.getTypeWidth(Ty);
bool IsSigned = TI.isTypeSigned(Ty);
// Use the target specified int64 type, when appropriate, so that [u]int64_t
// ends up being defined in terms of the correct type.
if (TypeWidth == 64)
Ty = IsSigned ? TI.getInt64Type() : TI.getUInt64Type();
const char *Prefix = IsSigned ? "__INT" : "__UINT";
DefineTypeSize(Prefix + Twine(TypeWidth) + "_MAX__", Ty, TI, Builder);
}
static void DefineLeastWidthIntType(unsigned TypeWidth, bool IsSigned,
const TargetInfo &TI,
MacroBuilder &Builder) {
TargetInfo::IntType Ty = TI.getLeastIntTypeByWidth(TypeWidth, IsSigned);
if (Ty == TargetInfo::NoInt)
return;
const char *Prefix = IsSigned ? "__INT_LEAST" : "__UINT_LEAST";
DefineType(Prefix + Twine(TypeWidth) + "_TYPE__", Ty, Builder);
DefineTypeSize(Prefix + Twine(TypeWidth) + "_MAX__", Ty, TI, Builder);
DefineFmt(Prefix + Twine(TypeWidth), Ty, TI, Builder);
}
static void DefineFastIntType(unsigned TypeWidth, bool IsSigned,
const TargetInfo &TI, MacroBuilder &Builder) {
// stdint.h currently defines the fast int types as equivalent to the least
// types.
TargetInfo::IntType Ty = TI.getLeastIntTypeByWidth(TypeWidth, IsSigned);
if (Ty == TargetInfo::NoInt)
return;
const char *Prefix = IsSigned ? "__INT_FAST" : "__UINT_FAST";
DefineType(Prefix + Twine(TypeWidth) + "_TYPE__", Ty, Builder);
DefineTypeSize(Prefix + Twine(TypeWidth) + "_MAX__", Ty, TI, Builder);
DefineFmt(Prefix + Twine(TypeWidth), Ty, TI, Builder);
}
/// Get the value the ATOMIC_*_LOCK_FREE macro should have for a type with
/// the specified properties.
-static const char *getLockFreeValue(unsigned TypeWidth, unsigned InlineWidth) {
+static const char *getLockFreeValue(unsigned TypeWidth, unsigned TypeAlign,
+ unsigned InlineWidth) {
// Fully-aligned, power-of-2 sizes no larger than the inline
// width will be inlined as lock-free operations.
- // Note: we do not need to check alignment since _Atomic(T) is always
- // appropriately-aligned in clang.
- if ((TypeWidth & (TypeWidth - 1)) == 0 && TypeWidth <= InlineWidth)
+ if (TypeWidth == TypeAlign && (TypeWidth & (TypeWidth - 1)) == 0 &&
+ TypeWidth <= InlineWidth)
return "2"; // "always lock free"
// We cannot be certain what operations the lib calls might be
// able to implement as lock-free on future processors.
return "1"; // "sometimes lock free"
}
/// \brief Add definitions required for a smooth interaction between
/// Objective-C++ automated reference counting and libstdc++ (4.2).
static void AddObjCXXARCLibstdcxxDefines(const LangOptions &LangOpts,
MacroBuilder &Builder) {
Builder.defineMacro("_GLIBCXX_PREDEFINED_OBJC_ARC_IS_SCALAR");
std::string Result;
{
// Provide specializations for the __is_scalar type trait so that
// lifetime-qualified objects are not considered "scalar" types, which
// libstdc++ uses as an indicator of the presence of trivial copy, assign,
// default-construct, and destruct semantics (none of which hold for
// lifetime-qualified objects in ARC).
llvm::raw_string_ostream Out(Result);
Out << "namespace std {\n"
<< "\n"
<< "struct __true_type;\n"
<< "struct __false_type;\n"
<< "\n";
Out << "template<typename _Tp> struct __is_scalar;\n"
<< "\n";
if (LangOpts.ObjCAutoRefCount) {
Out << "template<typename _Tp>\n"
<< "struct __is_scalar<__attribute__((objc_ownership(strong))) _Tp> {\n"
<< " enum { __value = 0 };\n"
<< " typedef __false_type __type;\n"
<< "};\n"
<< "\n";
}
if (LangOpts.ObjCWeak) {
Out << "template<typename _Tp>\n"
<< "struct __is_scalar<__attribute__((objc_ownership(weak))) _Tp> {\n"
<< " enum { __value = 0 };\n"
<< " typedef __false_type __type;\n"
<< "};\n"
<< "\n";
}
if (LangOpts.ObjCAutoRefCount) {
Out << "template<typename _Tp>\n"
<< "struct __is_scalar<__attribute__((objc_ownership(autoreleasing)))"
<< " _Tp> {\n"
<< " enum { __value = 0 };\n"
<< " typedef __false_type __type;\n"
<< "};\n"
<< "\n";
}
Out << "}\n";
}
Builder.append(Result);
}
static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
const LangOptions &LangOpts,
const FrontendOptions &FEOpts,
MacroBuilder &Builder) {
if (!LangOpts.MSVCCompat && !LangOpts.TraditionalCPP)
Builder.defineMacro("__STDC__");
if (LangOpts.Freestanding)
Builder.defineMacro("__STDC_HOSTED__", "0");
else
Builder.defineMacro("__STDC_HOSTED__");
if (!LangOpts.CPlusPlus) {
if (LangOpts.C11)
Builder.defineMacro("__STDC_VERSION__", "201112L");
else if (LangOpts.C99)
Builder.defineMacro("__STDC_VERSION__", "199901L");
else if (!LangOpts.GNUMode && LangOpts.Digraphs)
Builder.defineMacro("__STDC_VERSION__", "199409L");
} else {
// FIXME: Use correct value for C++17.
if (LangOpts.CPlusPlus1z)
Builder.defineMacro("__cplusplus", "201406L");
// C++1y [cpp.predefined]p1:
// The name __cplusplus is defined to the value 201402L when compiling a
// C++ translation unit.
else if (LangOpts.CPlusPlus14)
Builder.defineMacro("__cplusplus", "201402L");
// C++11 [cpp.predefined]p1:
// The name __cplusplus is defined to the value 201103L when compiling a
// C++ translation unit.
else if (LangOpts.CPlusPlus11)
Builder.defineMacro("__cplusplus", "201103L");
// C++03 [cpp.predefined]p1:
// The name __cplusplus is defined to the value 199711L when compiling a
// C++ translation unit.
else
Builder.defineMacro("__cplusplus", "199711L");
// C++1z [cpp.predefined]p1:
// An integer literal of type std::size_t whose value is the alignment
// guaranteed by a call to operator new(std::size_t)
//
// We provide this in all language modes, since it seems generally useful.
Builder.defineMacro("__STDCPP_DEFAULT_NEW_ALIGNMENT__",
Twine(TI.getNewAlign() / TI.getCharWidth()) +
TI.getTypeConstantSuffix(TI.getSizeType()));
}
// In C11 these are environment macros. In C++11 they are only defined
// as part of <cuchar>. To prevent breakage when mixing C and C++
// code, define these macros unconditionally. We can define them
// unconditionally, as Clang always uses UTF-16 and UTF-32 for 16-bit
// and 32-bit character literals.
Builder.defineMacro("__STDC_UTF_16__", "1");
Builder.defineMacro("__STDC_UTF_32__", "1");
if (LangOpts.ObjC1)
Builder.defineMacro("__OBJC__");
// OpenCL v1.0/1.1 s6.9, v1.2/2.0 s6.10: Preprocessor Directives and Macros.
if (LangOpts.OpenCL) {
// OpenCL v1.0 and v1.1 do not have a predefined macro to indicate the
// language standard with which the program is compiled. __OPENCL_VERSION__
// is for the OpenCL version supported by the OpenCL device, which is not
// necessarily the language standard with which the program is compiled.
// A shared OpenCL header file requires a macro to indicate the language
// standard. As a workaround, __OPENCL_C_VERSION__ is defined for
// OpenCL v1.0 and v1.1.
switch (LangOpts.OpenCLVersion) {
case 100:
Builder.defineMacro("__OPENCL_C_VERSION__", "100");
break;
case 110:
Builder.defineMacro("__OPENCL_C_VERSION__", "110");
break;
case 120:
Builder.defineMacro("__OPENCL_C_VERSION__", "120");
break;
case 200:
Builder.defineMacro("__OPENCL_C_VERSION__", "200");
break;
default:
llvm_unreachable("Unsupported OpenCL version");
}
Builder.defineMacro("CL_VERSION_1_0", "100");
Builder.defineMacro("CL_VERSION_1_1", "110");
Builder.defineMacro("CL_VERSION_1_2", "120");
Builder.defineMacro("CL_VERSION_2_0", "200");
if (TI.isLittleEndian())
Builder.defineMacro("__ENDIAN_LITTLE__");
if (LangOpts.FastRelaxedMath)
Builder.defineMacro("__FAST_RELAXED_MATH__");
}
// Not "standard" per se, but available even with the -undef flag.
if (LangOpts.AsmPreprocessor)
Builder.defineMacro("__ASSEMBLER__");
if (LangOpts.CUDA)
Builder.defineMacro("__CUDA__");
}
/// Initialize the predefined C++ language feature test macros defined in
/// ISO/IEC JTC1/SC22/WG21 (C++) SD-6: "SG10 Feature Test Recommendations".
static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts,
MacroBuilder &Builder) {
// C++98 features.
if (LangOpts.RTTI)
Builder.defineMacro("__cpp_rtti", "199711");
if (LangOpts.CXXExceptions)
Builder.defineMacro("__cpp_exceptions", "199711");
// C++11 features.
if (LangOpts.CPlusPlus11) {
Builder.defineMacro("__cpp_unicode_characters", "200704");
Builder.defineMacro("__cpp_raw_strings", "200710");
Builder.defineMacro("__cpp_unicode_literals", "200710");
Builder.defineMacro("__cpp_user_defined_literals", "200809");
Builder.defineMacro("__cpp_lambdas", "200907");
Builder.defineMacro("__cpp_constexpr",
LangOpts.CPlusPlus14 ? "201304" : "200704");
Builder.defineMacro("__cpp_range_based_for",
LangOpts.CPlusPlus1z ? "201603" : "200907");
Builder.defineMacro("__cpp_static_assert",
LangOpts.CPlusPlus1z ? "201411" : "200410");
Builder.defineMacro("__cpp_decltype", "200707");
Builder.defineMacro("__cpp_attributes", "200809");
Builder.defineMacro("__cpp_rvalue_references", "200610");
Builder.defineMacro("__cpp_variadic_templates", "200704");
Builder.defineMacro("__cpp_initializer_lists", "200806");
Builder.defineMacro("__cpp_delegating_constructors", "200604");
Builder.defineMacro("__cpp_nsdmi", "200809");
Builder.defineMacro("__cpp_inheriting_constructors", "201511");
Builder.defineMacro("__cpp_ref_qualifiers", "200710");
Builder.defineMacro("__cpp_alias_templates", "200704");
}
// C++14 features.
if (LangOpts.CPlusPlus14) {
Builder.defineMacro("__cpp_binary_literals", "201304");
Builder.defineMacro("__cpp_digit_separators", "201309");
Builder.defineMacro("__cpp_init_captures", "201304");
Builder.defineMacro("__cpp_generic_lambdas", "201304");
Builder.defineMacro("__cpp_decltype_auto", "201304");
Builder.defineMacro("__cpp_return_type_deduction", "201304");
Builder.defineMacro("__cpp_aggregate_nsdmi", "201304");
Builder.defineMacro("__cpp_variable_templates", "201304");
}
if (LangOpts.SizedDeallocation)
Builder.defineMacro("__cpp_sized_deallocation", "201309");
// C++17 features.
if (LangOpts.CPlusPlus1z) {
Builder.defineMacro("__cpp_hex_float", "201603");
Builder.defineMacro("__cpp_inline_variables", "201606");
Builder.defineMacro("__cpp_noexcept_function_type", "201510");
Builder.defineMacro("__cpp_capture_star_this", "201603");
Builder.defineMacro("__cpp_if_constexpr", "201606");
Builder.defineMacro("__cpp_template_auto", "201606");
Builder.defineMacro("__cpp_namespace_attributes", "201411");
Builder.defineMacro("__cpp_enumerator_attributes", "201411");
Builder.defineMacro("__cpp_nested_namespace_definitions", "201411");
Builder.defineMacro("__cpp_variadic_using", "201611");
Builder.defineMacro("__cpp_aggregate_bases", "201603");
Builder.defineMacro("__cpp_structured_bindings", "201606");
Builder.defineMacro("__cpp_nontype_template_args", "201411");
Builder.defineMacro("__cpp_fold_expressions", "201603");
}
if (LangOpts.AlignedAllocation)
Builder.defineMacro("__cpp_aligned_new", "201606");
// TS features.
if (LangOpts.ConceptsTS)
Builder.defineMacro("__cpp_experimental_concepts", "1");
if (LangOpts.CoroutinesTS)
Builder.defineMacro("__cpp_coroutines", "1");
}
static void InitializePredefinedMacros(const TargetInfo &TI,
const LangOptions &LangOpts,
const FrontendOptions &FEOpts,
MacroBuilder &Builder) {
// Compiler version introspection macros.
Builder.defineMacro("__llvm__"); // LLVM Backend
Builder.defineMacro("__clang__"); // Clang Frontend
#define TOSTR2(X) #X
#define TOSTR(X) TOSTR2(X)
Builder.defineMacro("__clang_major__", TOSTR(CLANG_VERSION_MAJOR));
Builder.defineMacro("__clang_minor__", TOSTR(CLANG_VERSION_MINOR));
Builder.defineMacro("__clang_patchlevel__", TOSTR(CLANG_VERSION_PATCHLEVEL));
#undef TOSTR
#undef TOSTR2
Builder.defineMacro("__clang_version__",
"\"" CLANG_VERSION_STRING " "
+ getClangFullRepositoryVersion() + "\"");
if (!LangOpts.MSVCCompat) {
// Currently claim to be compatible with GCC 4.2.1-5621, but only if we're
// not compiling for MSVC compatibility
Builder.defineMacro("__GNUC_MINOR__", "2");
Builder.defineMacro("__GNUC_PATCHLEVEL__", "1");
Builder.defineMacro("__GNUC__", "4");
Builder.defineMacro("__GXX_ABI_VERSION", "1002");
}
// Define macros for the C11 / C++11 memory orderings
Builder.defineMacro("__ATOMIC_RELAXED", "0");
Builder.defineMacro("__ATOMIC_CONSUME", "1");
Builder.defineMacro("__ATOMIC_ACQUIRE", "2");
Builder.defineMacro("__ATOMIC_RELEASE", "3");
Builder.defineMacro("__ATOMIC_ACQ_REL", "4");
Builder.defineMacro("__ATOMIC_SEQ_CST", "5");
// Support for #pragma redefine_extname (Sun compatibility)
Builder.defineMacro("__PRAGMA_REDEFINE_EXTNAME", "1");
// As sad as it is, enough software depends on the __VERSION__ for version
// checks that it is necessary to report 4.2.1 (the base GCC version we claim
// compatibility with) first.
Builder.defineMacro("__VERSION__", "\"4.2.1 Compatible " +
Twine(getClangFullCPPVersion()) + "\"");
// Initialize language-specific preprocessor defines.
// Standard conforming mode?
if (!LangOpts.GNUMode && !LangOpts.MSVCCompat)
Builder.defineMacro("__STRICT_ANSI__");
if (!LangOpts.MSVCCompat && LangOpts.CPlusPlus11)
Builder.defineMacro("__GXX_EXPERIMENTAL_CXX0X__");
if (LangOpts.ObjC1) {
if (LangOpts.ObjCRuntime.isNonFragile()) {
Builder.defineMacro("__OBJC2__");
if (LangOpts.ObjCExceptions)
Builder.defineMacro("OBJC_ZEROCOST_EXCEPTIONS");
}
Builder.defineMacro("__OBJC_BOOL_IS_BOOL",
Twine(TI.useSignedCharForObjCBool() ? "0" : "1"));
if (LangOpts.getGC() != LangOptions::NonGC)
Builder.defineMacro("__OBJC_GC__");
if (LangOpts.ObjCRuntime.isNeXTFamily())
Builder.defineMacro("__NEXT_RUNTIME__");
if (LangOpts.ObjCRuntime.getKind() == ObjCRuntime::ObjFW) {
VersionTuple tuple = LangOpts.ObjCRuntime.getVersion();
unsigned minor = 0;
if (tuple.getMinor().hasValue())
minor = tuple.getMinor().getValue();
unsigned subminor = 0;
if (tuple.getSubminor().hasValue())
subminor = tuple.getSubminor().getValue();
Builder.defineMacro("__OBJFW_RUNTIME_ABI__",
Twine(tuple.getMajor() * 10000 + minor * 100 +
subminor));
}
Builder.defineMacro("IBOutlet", "__attribute__((iboutlet))");
Builder.defineMacro("IBOutletCollection(ClassName)",
"__attribute__((iboutletcollection(ClassName)))");
Builder.defineMacro("IBAction", "void)__attribute__((ibaction)");
Builder.defineMacro("IBInspectable", "");
Builder.defineMacro("IB_DESIGNABLE", "");
}
if (LangOpts.CPlusPlus)
InitializeCPlusPlusFeatureTestMacros(LangOpts, Builder);
// darwin_constant_cfstrings controls this. This is also dependent
// on other things like the runtime I believe. This is set even for C code.
if (!LangOpts.NoConstantCFStrings)
Builder.defineMacro("__CONSTANT_CFSTRINGS__");
if (LangOpts.ObjC2)
Builder.defineMacro("OBJC_NEW_PROPERTIES");
if (LangOpts.PascalStrings)
Builder.defineMacro("__PASCAL_STRINGS__");
if (LangOpts.Blocks) {
Builder.defineMacro("__block", "__attribute__((__blocks__(byref)))");
Builder.defineMacro("__BLOCKS__");
}
if (!LangOpts.MSVCCompat && LangOpts.Exceptions)
Builder.defineMacro("__EXCEPTIONS");
if (!LangOpts.MSVCCompat && LangOpts.RTTI)
Builder.defineMacro("__GXX_RTTI");
if (LangOpts.SjLjExceptions)
Builder.defineMacro("__USING_SJLJ_EXCEPTIONS__");
if (LangOpts.Deprecated)
Builder.defineMacro("__DEPRECATED");
if (!LangOpts.MSVCCompat && LangOpts.CPlusPlus) {
Builder.defineMacro("__GNUG__", "4");
Builder.defineMacro("__GXX_WEAK__");
Builder.defineMacro("__private_extern__", "extern");
}
if (LangOpts.MicrosoftExt) {
if (LangOpts.WChar) {
// wchar_t supported as a keyword.
Builder.defineMacro("_WCHAR_T_DEFINED");
Builder.defineMacro("_NATIVE_WCHAR_T_DEFINED");
}
}
if (LangOpts.Optimize)
Builder.defineMacro("__OPTIMIZE__");
if (LangOpts.OptimizeSize)
Builder.defineMacro("__OPTIMIZE_SIZE__");
if (LangOpts.FastMath)
Builder.defineMacro("__FAST_MATH__");
// Initialize target-specific preprocessor defines.
// __BYTE_ORDER__ was added in GCC 4.6. It's analogous
// to the macro __BYTE_ORDER (no trailing underscores)
// from glibc's <endian.h> header.
// We don't support the PDP-11 as a target, but include
// the define so it can still be compared against.
Builder.defineMacro("__ORDER_LITTLE_ENDIAN__", "1234");
Builder.defineMacro("__ORDER_BIG_ENDIAN__", "4321");
Builder.defineMacro("__ORDER_PDP_ENDIAN__", "3412");
if (TI.isBigEndian()) {
Builder.defineMacro("__BYTE_ORDER__", "__ORDER_BIG_ENDIAN__");
Builder.defineMacro("__BIG_ENDIAN__");
} else {
Builder.defineMacro("__BYTE_ORDER__", "__ORDER_LITTLE_ENDIAN__");
Builder.defineMacro("__LITTLE_ENDIAN__");
}
if (TI.getPointerWidth(0) == 64 && TI.getLongWidth() == 64
&& TI.getIntWidth() == 32) {
Builder.defineMacro("_LP64");
Builder.defineMacro("__LP64__");
}
if (TI.getPointerWidth(0) == 32 && TI.getLongWidth() == 32
&& TI.getIntWidth() == 32) {
Builder.defineMacro("_ILP32");
Builder.defineMacro("__ILP32__");
}
// Define type sizing macros based on the target properties.
assert(TI.getCharWidth() == 8 && "Only support 8-bit char so far");
Builder.defineMacro("__CHAR_BIT__", Twine(TI.getCharWidth()));
DefineTypeSize("__SCHAR_MAX__", TargetInfo::SignedChar, TI, Builder);
DefineTypeSize("__SHRT_MAX__", TargetInfo::SignedShort, TI, Builder);
DefineTypeSize("__INT_MAX__", TargetInfo::SignedInt, TI, Builder);
DefineTypeSize("__LONG_MAX__", TargetInfo::SignedLong, TI, Builder);
DefineTypeSize("__LONG_LONG_MAX__", TargetInfo::SignedLongLong, TI, Builder);
DefineTypeSize("__WCHAR_MAX__", TI.getWCharType(), TI, Builder);
DefineTypeSize("__INTMAX_MAX__", TI.getIntMaxType(), TI, Builder);
DefineTypeSize("__SIZE_MAX__", TI.getSizeType(), TI, Builder);
DefineTypeSize("__UINTMAX_MAX__", TI.getUIntMaxType(), TI, Builder);
DefineTypeSize("__PTRDIFF_MAX__", TI.getPtrDiffType(0), TI, Builder);
DefineTypeSize("__INTPTR_MAX__", TI.getIntPtrType(), TI, Builder);
DefineTypeSize("__UINTPTR_MAX__", TI.getUIntPtrType(), TI, Builder);
DefineTypeSizeof("__SIZEOF_DOUBLE__", TI.getDoubleWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_FLOAT__", TI.getFloatWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_INT__", TI.getIntWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_LONG__", TI.getLongWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_LONG_DOUBLE__",TI.getLongDoubleWidth(),TI,Builder);
DefineTypeSizeof("__SIZEOF_LONG_LONG__", TI.getLongLongWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_POINTER__", TI.getPointerWidth(0), TI, Builder);
DefineTypeSizeof("__SIZEOF_SHORT__", TI.getShortWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_PTRDIFF_T__",
TI.getTypeWidth(TI.getPtrDiffType(0)), TI, Builder);
DefineTypeSizeof("__SIZEOF_SIZE_T__",
TI.getTypeWidth(TI.getSizeType()), TI, Builder);
DefineTypeSizeof("__SIZEOF_WCHAR_T__",
TI.getTypeWidth(TI.getWCharType()), TI, Builder);
DefineTypeSizeof("__SIZEOF_WINT_T__",
TI.getTypeWidth(TI.getWIntType()), TI, Builder);
if (TI.hasInt128Type())
DefineTypeSizeof("__SIZEOF_INT128__", 128, TI, Builder);
DefineType("__INTMAX_TYPE__", TI.getIntMaxType(), Builder);
DefineFmt("__INTMAX", TI.getIntMaxType(), TI, Builder);
Builder.defineMacro("__INTMAX_C_SUFFIX__",
TI.getTypeConstantSuffix(TI.getIntMaxType()));
DefineType("__UINTMAX_TYPE__", TI.getUIntMaxType(), Builder);
DefineFmt("__UINTMAX", TI.getUIntMaxType(), TI, Builder);
Builder.defineMacro("__UINTMAX_C_SUFFIX__",
TI.getTypeConstantSuffix(TI.getUIntMaxType()));
DefineTypeWidth("__INTMAX_WIDTH__", TI.getIntMaxType(), TI, Builder);
DefineType("__PTRDIFF_TYPE__", TI.getPtrDiffType(0), Builder);
DefineFmt("__PTRDIFF", TI.getPtrDiffType(0), TI, Builder);
DefineTypeWidth("__PTRDIFF_WIDTH__", TI.getPtrDiffType(0), TI, Builder);
DefineType("__INTPTR_TYPE__", TI.getIntPtrType(), Builder);
DefineFmt("__INTPTR", TI.getIntPtrType(), TI, Builder);
DefineTypeWidth("__INTPTR_WIDTH__", TI.getIntPtrType(), TI, Builder);
DefineType("__SIZE_TYPE__", TI.getSizeType(), Builder);
DefineFmt("__SIZE", TI.getSizeType(), TI, Builder);
DefineTypeWidth("__SIZE_WIDTH__", TI.getSizeType(), TI, Builder);
DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder);
DefineTypeWidth("__WCHAR_WIDTH__", TI.getWCharType(), TI, Builder);
DefineType("__WINT_TYPE__", TI.getWIntType(), Builder);
DefineTypeWidth("__WINT_WIDTH__", TI.getWIntType(), TI, Builder);
DefineTypeWidth("__SIG_ATOMIC_WIDTH__", TI.getSigAtomicType(), TI, Builder);
DefineTypeSize("__SIG_ATOMIC_MAX__", TI.getSigAtomicType(), TI, Builder);
DefineType("__CHAR16_TYPE__", TI.getChar16Type(), Builder);
DefineType("__CHAR32_TYPE__", TI.getChar32Type(), Builder);
DefineTypeWidth("__UINTMAX_WIDTH__", TI.getUIntMaxType(), TI, Builder);
DefineType("__UINTPTR_TYPE__", TI.getUIntPtrType(), Builder);
DefineFmt("__UINTPTR", TI.getUIntPtrType(), TI, Builder);
DefineTypeWidth("__UINTPTR_WIDTH__", TI.getUIntPtrType(), TI, Builder);
DefineFloatMacros(Builder, "FLT", &TI.getFloatFormat(), "F");
DefineFloatMacros(Builder, "DBL", &TI.getDoubleFormat(), "");
DefineFloatMacros(Builder, "LDBL", &TI.getLongDoubleFormat(), "L");
// Define a __POINTER_WIDTH__ macro for stdint.h.
Builder.defineMacro("__POINTER_WIDTH__",
Twine((int)TI.getPointerWidth(0)));
// Define __BIGGEST_ALIGNMENT__ to be compatible with gcc.
Builder.defineMacro("__BIGGEST_ALIGNMENT__",
Twine(TI.getSuitableAlign() / TI.getCharWidth()) );
if (!LangOpts.CharIsSigned)
Builder.defineMacro("__CHAR_UNSIGNED__");
if (!TargetInfo::isTypeSigned(TI.getWCharType()))
Builder.defineMacro("__WCHAR_UNSIGNED__");
if (!TargetInfo::isTypeSigned(TI.getWIntType()))
Builder.defineMacro("__WINT_UNSIGNED__");
// Define exact-width integer types for stdint.h
DefineExactWidthIntType(TargetInfo::SignedChar, TI, Builder);
if (TI.getShortWidth() > TI.getCharWidth())
DefineExactWidthIntType(TargetInfo::SignedShort, TI, Builder);
if (TI.getIntWidth() > TI.getShortWidth())
DefineExactWidthIntType(TargetInfo::SignedInt, TI, Builder);
if (TI.getLongWidth() > TI.getIntWidth())
DefineExactWidthIntType(TargetInfo::SignedLong, TI, Builder);
if (TI.getLongLongWidth() > TI.getLongWidth())
DefineExactWidthIntType(TargetInfo::SignedLongLong, TI, Builder);
DefineExactWidthIntType(TargetInfo::UnsignedChar, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::UnsignedChar, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::SignedChar, TI, Builder);
if (TI.getShortWidth() > TI.getCharWidth()) {
DefineExactWidthIntType(TargetInfo::UnsignedShort, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::UnsignedShort, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::SignedShort, TI, Builder);
}
if (TI.getIntWidth() > TI.getShortWidth()) {
DefineExactWidthIntType(TargetInfo::UnsignedInt, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::UnsignedInt, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::SignedInt, TI, Builder);
}
if (TI.getLongWidth() > TI.getIntWidth()) {
DefineExactWidthIntType(TargetInfo::UnsignedLong, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::UnsignedLong, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::SignedLong, TI, Builder);
}
if (TI.getLongLongWidth() > TI.getLongWidth()) {
DefineExactWidthIntType(TargetInfo::UnsignedLongLong, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::UnsignedLongLong, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::SignedLongLong, TI, Builder);
}
DefineLeastWidthIntType(8, true, TI, Builder);
DefineLeastWidthIntType(8, false, TI, Builder);
DefineLeastWidthIntType(16, true, TI, Builder);
DefineLeastWidthIntType(16, false, TI, Builder);
DefineLeastWidthIntType(32, true, TI, Builder);
DefineLeastWidthIntType(32, false, TI, Builder);
DefineLeastWidthIntType(64, true, TI, Builder);
DefineLeastWidthIntType(64, false, TI, Builder);
DefineFastIntType(8, true, TI, Builder);
DefineFastIntType(8, false, TI, Builder);
DefineFastIntType(16, true, TI, Builder);
DefineFastIntType(16, false, TI, Builder);
DefineFastIntType(32, true, TI, Builder);
DefineFastIntType(32, false, TI, Builder);
DefineFastIntType(64, true, TI, Builder);
DefineFastIntType(64, false, TI, Builder);
char UserLabelPrefix[2] = {TI.getDataLayout().getGlobalPrefix(), 0};
Builder.defineMacro("__USER_LABEL_PREFIX__", UserLabelPrefix);
if (LangOpts.FastMath || LangOpts.FiniteMathOnly)
Builder.defineMacro("__FINITE_MATH_ONLY__", "1");
else
Builder.defineMacro("__FINITE_MATH_ONLY__", "0");
if (!LangOpts.MSVCCompat) {
if (LangOpts.GNUInline || LangOpts.CPlusPlus)
Builder.defineMacro("__GNUC_GNU_INLINE__");
else
Builder.defineMacro("__GNUC_STDC_INLINE__");
// The value written by __atomic_test_and_set.
// FIXME: This is target-dependent.
Builder.defineMacro("__GCC_ATOMIC_TEST_AND_SET_TRUEVAL", "1");
// Used by libc++ and libstdc++ to implement ATOMIC_<foo>_LOCK_FREE.
unsigned InlineWidthBits = TI.getMaxAtomicInlineWidth();
#define DEFINE_LOCK_FREE_MACRO(TYPE, Type) \
Builder.defineMacro("__GCC_ATOMIC_" #TYPE "_LOCK_FREE", \
getLockFreeValue(TI.get##Type##Width(), \
+ TI.get##Type##Align(), \
InlineWidthBits));
DEFINE_LOCK_FREE_MACRO(BOOL, Bool);
DEFINE_LOCK_FREE_MACRO(CHAR, Char);
DEFINE_LOCK_FREE_MACRO(CHAR16_T, Char16);
DEFINE_LOCK_FREE_MACRO(CHAR32_T, Char32);
DEFINE_LOCK_FREE_MACRO(WCHAR_T, WChar);
DEFINE_LOCK_FREE_MACRO(SHORT, Short);
DEFINE_LOCK_FREE_MACRO(INT, Int);
DEFINE_LOCK_FREE_MACRO(LONG, Long);
DEFINE_LOCK_FREE_MACRO(LLONG, LongLong);
Builder.defineMacro("__GCC_ATOMIC_POINTER_LOCK_FREE",
getLockFreeValue(TI.getPointerWidth(0),
+ TI.getPointerAlign(0),
InlineWidthBits));
#undef DEFINE_LOCK_FREE_MACRO
}
if (LangOpts.NoInlineDefine)
Builder.defineMacro("__NO_INLINE__");
if (unsigned PICLevel = LangOpts.PICLevel) {
Builder.defineMacro("__PIC__", Twine(PICLevel));
Builder.defineMacro("__pic__", Twine(PICLevel));
if (LangOpts.PIE) {
Builder.defineMacro("__PIE__", Twine(PICLevel));
Builder.defineMacro("__pie__", Twine(PICLevel));
}
}
// Macros to control C99 numerics and <float.h>
Builder.defineMacro("__FLT_EVAL_METHOD__", Twine(TI.getFloatEvalMethod()));
Builder.defineMacro("__FLT_RADIX__", "2");
Builder.defineMacro("__DECIMAL_DIG__", "__LDBL_DECIMAL_DIG__");
if (LangOpts.getStackProtector() == LangOptions::SSPOn)
Builder.defineMacro("__SSP__");
else if (LangOpts.getStackProtector() == LangOptions::SSPStrong)
Builder.defineMacro("__SSP_STRONG__", "2");
else if (LangOpts.getStackProtector() == LangOptions::SSPReq)
Builder.defineMacro("__SSP_ALL__", "3");
// Define a macro that exists only when using the static analyzer.
if (FEOpts.ProgramAction == frontend::RunAnalysis)
Builder.defineMacro("__clang_analyzer__");
if (LangOpts.FastRelaxedMath)
Builder.defineMacro("__FAST_RELAXED_MATH__");
if (FEOpts.ProgramAction == frontend::RewriteObjC ||
LangOpts.getGC() != LangOptions::NonGC) {
Builder.defineMacro("__weak", "__attribute__((objc_gc(weak)))");
Builder.defineMacro("__strong", "__attribute__((objc_gc(strong)))");
Builder.defineMacro("__autoreleasing", "");
Builder.defineMacro("__unsafe_unretained", "");
} else if (LangOpts.ObjC1) {
Builder.defineMacro("__weak", "__attribute__((objc_ownership(weak)))");
Builder.defineMacro("__strong", "__attribute__((objc_ownership(strong)))");
Builder.defineMacro("__autoreleasing",
"__attribute__((objc_ownership(autoreleasing)))");
Builder.defineMacro("__unsafe_unretained",
"__attribute__((objc_ownership(none)))");
}
// On Darwin, there are __double_underscored variants of the type
// nullability qualifiers.
if (TI.getTriple().isOSDarwin()) {
Builder.defineMacro("__nonnull", "_Nonnull");
Builder.defineMacro("__null_unspecified", "_Null_unspecified");
Builder.defineMacro("__nullable", "_Nullable");
}
// OpenMP definition
// OpenMP 2.2:
// In implementations that support a preprocessor, the _OPENMP
// macro name is defined to have the decimal value yyyymm where
// yyyy and mm are the year and the month designations of the
// version of the OpenMP API that the implementation support.
switch (LangOpts.OpenMP) {
case 0:
break;
case 40:
Builder.defineMacro("_OPENMP", "201307");
break;
case 45:
Builder.defineMacro("_OPENMP", "201511");
break;
default:
// Default version is OpenMP 3.1
Builder.defineMacro("_OPENMP", "201107");
break;
}
// CUDA device path compilaton
if (LangOpts.CUDAIsDevice) {
// The CUDA_ARCH value is set for the GPU target specified in the NVPTX
// backend's target defines.
Builder.defineMacro("__CUDA_ARCH__");
}
// We need to communicate this to our CUDA header wrapper, which in turn
// informs the proper CUDA headers of this choice.
if (LangOpts.CUDADeviceApproxTranscendentals || LangOpts.FastMath) {
Builder.defineMacro("__CLANG_CUDA_APPROX_TRANSCENDENTALS__");
}
// OpenCL definitions.
if (LangOpts.OpenCL) {
#define OPENCLEXT(Ext) \
if (TI.getSupportedOpenCLOpts().isSupported(#Ext, \
LangOpts.OpenCLVersion)) \
Builder.defineMacro(#Ext);
#include "clang/Basic/OpenCLExtensions.def"
}
if (TI.hasInt128Type() && LangOpts.CPlusPlus && LangOpts.GNUMode) {
// For each extended integer type, g++ defines a macro mapping the
// index of the type (0 in this case) in some list of extended types
// to the type.
Builder.defineMacro("__GLIBCXX_TYPE_INT_N_0", "__int128");
Builder.defineMacro("__GLIBCXX_BITSIZE_INT_N_0", "128");
}
// Get other target #defines.
TI.getTargetDefines(LangOpts, Builder);
}
/// InitializePreprocessor - Initialize the preprocessor getting it and the
/// environment ready to process a single file. This returns true on error.
///
void clang::InitializePreprocessor(
Preprocessor &PP, const PreprocessorOptions &InitOpts,
const PCHContainerReader &PCHContainerRdr,
const FrontendOptions &FEOpts) {
const LangOptions &LangOpts = PP.getLangOpts();
std::string PredefineBuffer;
PredefineBuffer.reserve(4080);
llvm::raw_string_ostream Predefines(PredefineBuffer);
MacroBuilder Builder(Predefines);
// Emit line markers for various builtin sections of the file. We don't do
// this in asm preprocessor mode, because "# 4" is not a line marker directive
// in this mode.
if (!PP.getLangOpts().AsmPreprocessor)
Builder.append("# 1 \"<built-in>\" 3");
// Install things like __POWERPC__, __GNUC__, etc into the macro table.
if (InitOpts.UsePredefines) {
if (LangOpts.CUDA && PP.getAuxTargetInfo())
InitializePredefinedMacros(*PP.getAuxTargetInfo(), LangOpts, FEOpts,
Builder);
InitializePredefinedMacros(PP.getTargetInfo(), LangOpts, FEOpts, Builder);
// Install definitions to make Objective-C++ ARC work well with various
// C++ Standard Library implementations.
if (LangOpts.ObjC1 && LangOpts.CPlusPlus &&
(LangOpts.ObjCAutoRefCount || LangOpts.ObjCWeak)) {
switch (InitOpts.ObjCXXARCStandardLibrary) {
case ARCXX_nolib:
case ARCXX_libcxx:
break;
case ARCXX_libstdcxx:
AddObjCXXARCLibstdcxxDefines(LangOpts, Builder);
break;
}
}
}
// Even with predefines off, some macros are still predefined.
// These should all be defined in the preprocessor according to the
// current language configuration.
InitializeStandardPredefinedMacros(PP.getTargetInfo(), PP.getLangOpts(),
FEOpts, Builder);
// Add on the predefines from the driver. Wrap in a #line directive to report
// that they come from the command line.
if (!PP.getLangOpts().AsmPreprocessor)
Builder.append("# 1 \"<command line>\" 1");
// Process #define's and #undef's in the order they are given.
for (unsigned i = 0, e = InitOpts.Macros.size(); i != e; ++i) {
if (InitOpts.Macros[i].second) // isUndef
Builder.undefineMacro(InitOpts.Macros[i].first);
else
DefineBuiltinMacro(Builder, InitOpts.Macros[i].first,
PP.getDiagnostics());
}
// Exit the command line and go back to <built-in> (2 is LC_LEAVE).
if (!PP.getLangOpts().AsmPreprocessor)
Builder.append("# 1 \"<built-in>\" 2");
// If -imacros are specified, include them now. These are processed before
// any -include directives.
for (unsigned i = 0, e = InitOpts.MacroIncludes.size(); i != e; ++i)
AddImplicitIncludeMacros(Builder, InitOpts.MacroIncludes[i]);
// Process -include-pch/-include-pth directives.
if (!InitOpts.ImplicitPCHInclude.empty())
AddImplicitIncludePCH(Builder, PP, PCHContainerRdr,
InitOpts.ImplicitPCHInclude);
if (!InitOpts.ImplicitPTHInclude.empty())
AddImplicitIncludePTH(Builder, PP, InitOpts.ImplicitPTHInclude);
// Process -include directives.
for (unsigned i = 0, e = InitOpts.Includes.size(); i != e; ++i) {
const std::string &Path = InitOpts.Includes[i];
AddImplicitInclude(Builder, Path);
}
// Instruct the preprocessor to skip the preamble.
PP.setSkipMainFilePreamble(InitOpts.PrecompiledPreambleBytes.first,
InitOpts.PrecompiledPreambleBytes.second);
// Copy PredefinedBuffer into the Preprocessor.
PP.setPredefines(Predefines.str());
}
Index: projects/clang400-import/contrib/llvm/tools/clang/lib/StaticAnalyzer/Checkers/VirtualCallChecker.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/tools/clang/lib/StaticAnalyzer/Checkers/VirtualCallChecker.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/clang/lib/StaticAnalyzer/Checkers/VirtualCallChecker.cpp (revision 314269)
@@ -1,291 +1,292 @@
//=======- VirtualCallChecker.cpp --------------------------------*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines a checker that checks virtual function calls during
// construction or destruction of C++ objects.
//
//===----------------------------------------------------------------------===//
#include "ClangSACheckers.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/SaveAndRestore.h"
#include "llvm/Support/raw_ostream.h"
using namespace clang;
using namespace ento;
namespace {
class WalkAST : public StmtVisitor<WalkAST> {
const CheckerBase *Checker;
BugReporter &BR;
AnalysisDeclContext *AC;
/// The root constructor or destructor whose callees are being analyzed.
const CXXMethodDecl *RootMethod = nullptr;
/// Whether the checker should walk into bodies of called functions.
/// Controlled by the "Interprocedural" analyzer-config option.
bool IsInterprocedural = false;
/// Whether the checker should only warn for calls to pure virtual functions
/// (which is undefined behavior) or for all virtual functions (which may
/// may result in unexpected behavior).
bool ReportPureOnly = false;
typedef const CallExpr * WorkListUnit;
typedef SmallVector<WorkListUnit, 20> DFSWorkList;
/// A vector representing the worklist which has a chain of CallExprs.
DFSWorkList WList;
// PreVisited : A CallExpr to this FunctionDecl is in the worklist, but the
// body has not been visited yet.
// PostVisited : A CallExpr to this FunctionDecl is in the worklist, and the
// body has been visited.
enum Kind { NotVisited,
PreVisited, /**< A CallExpr to this FunctionDecl is in the
worklist, but the body has not yet been
visited. */
PostVisited /**< A CallExpr to this FunctionDecl is in the
worklist, and the body has been visited. */
};
/// A DenseMap that records visited states of FunctionDecls.
llvm::DenseMap<const FunctionDecl *, Kind> VisitedFunctions;
/// The CallExpr whose body is currently being visited. This is used for
/// generating bug reports. This is null while visiting the body of a
/// constructor or destructor.
const CallExpr *visitingCallExpr;
public:
WalkAST(const CheckerBase *checker, BugReporter &br, AnalysisDeclContext *ac,
const CXXMethodDecl *rootMethod, bool isInterprocedural,
bool reportPureOnly)
: Checker(checker), BR(br), AC(ac), RootMethod(rootMethod),
IsInterprocedural(isInterprocedural), ReportPureOnly(reportPureOnly),
visitingCallExpr(nullptr) {
// Walking should always start from either a constructor or a destructor.
assert(isa<CXXConstructorDecl>(rootMethod) ||
isa<CXXDestructorDecl>(rootMethod));
}
bool hasWork() const { return !WList.empty(); }
/// This method adds a CallExpr to the worklist and marks the callee as
/// being PreVisited.
void Enqueue(WorkListUnit WLUnit) {
const FunctionDecl *FD = WLUnit->getDirectCallee();
if (!FD || !FD->getBody())
return;
Kind &K = VisitedFunctions[FD];
if (K != NotVisited)
return;
K = PreVisited;
WList.push_back(WLUnit);
}
/// This method returns an item from the worklist without removing it.
WorkListUnit Dequeue() {
assert(!WList.empty());
return WList.back();
}
void Execute() {
while (hasWork()) {
WorkListUnit WLUnit = Dequeue();
const FunctionDecl *FD = WLUnit->getDirectCallee();
assert(FD && FD->getBody());
if (VisitedFunctions[FD] == PreVisited) {
// If the callee is PreVisited, walk its body.
// Visit the body.
SaveAndRestore<const CallExpr *> SaveCall(visitingCallExpr, WLUnit);
Visit(FD->getBody());
// Mark the function as being PostVisited to indicate we have
// scanned the body.
VisitedFunctions[FD] = PostVisited;
continue;
}
// Otherwise, the callee is PostVisited.
// Remove it from the worklist.
assert(VisitedFunctions[FD] == PostVisited);
WList.pop_back();
}
}
// Stmt visitor methods.
void VisitCallExpr(CallExpr *CE);
void VisitCXXMemberCallExpr(CallExpr *CE);
void VisitStmt(Stmt *S) { VisitChildren(S); }
void VisitChildren(Stmt *S);
void ReportVirtualCall(const CallExpr *CE, bool isPure);
};
} // end anonymous namespace
//===----------------------------------------------------------------------===//
// AST walking.
//===----------------------------------------------------------------------===//
void WalkAST::VisitChildren(Stmt *S) {
for (Stmt *Child : S->children())
if (Child)
Visit(Child);
}
void WalkAST::VisitCallExpr(CallExpr *CE) {
VisitChildren(CE);
if (IsInterprocedural)
Enqueue(CE);
}
void WalkAST::VisitCXXMemberCallExpr(CallExpr *CE) {
VisitChildren(CE);
bool callIsNonVirtual = false;
// Several situations to elide for checking.
if (MemberExpr *CME = dyn_cast<MemberExpr>(CE->getCallee())) {
// If the member access is fully qualified (i.e., X::F), then treat
// this as a non-virtual call and do not warn.
if (CME->getQualifier())
callIsNonVirtual = true;
if (Expr *base = CME->getBase()->IgnoreImpCasts()) {
// Elide analyzing the call entirely if the base pointer is not 'this'.
if (!isa<CXXThisExpr>(base))
return;
// If the most derived class is marked final, we know that now subclass
// can override this member.
if (base->getBestDynamicClassType()->hasAttr<FinalAttr>())
callIsNonVirtual = true;
}
}
// Get the callee.
- const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(CE->getDirectCallee());
+ const CXXMethodDecl *MD =
+ dyn_cast_or_null<CXXMethodDecl>(CE->getDirectCallee());
if (MD && MD->isVirtual() && !callIsNonVirtual && !MD->hasAttr<FinalAttr>() &&
!MD->getParent()->hasAttr<FinalAttr>())
ReportVirtualCall(CE, MD->isPure());
if (IsInterprocedural)
Enqueue(CE);
}
void WalkAST::ReportVirtualCall(const CallExpr *CE, bool isPure) {
if (ReportPureOnly && !isPure)
return;
SmallString<100> buf;
llvm::raw_svector_ostream os(buf);
// FIXME: The interprocedural diagnostic experience here is not good.
// Ultimately this checker should be re-written to be path sensitive.
// For now, only diagnose intraprocedurally, by default.
if (IsInterprocedural) {
os << "Call Path : ";
// Name of current visiting CallExpr.
os << *CE->getDirectCallee();
// Name of the CallExpr whose body is current being walked.
if (visitingCallExpr)
os << " <-- " << *visitingCallExpr->getDirectCallee();
// Names of FunctionDecls in worklist with state PostVisited.
for (SmallVectorImpl<const CallExpr *>::iterator I = WList.end(),
E = WList.begin(); I != E; --I) {
const FunctionDecl *FD = (*(I-1))->getDirectCallee();
assert(FD);
if (VisitedFunctions[FD] == PostVisited)
os << " <-- " << *FD;
}
os << "\n";
}
PathDiagnosticLocation CELoc =
PathDiagnosticLocation::createBegin(CE, BR.getSourceManager(), AC);
SourceRange R = CE->getCallee()->getSourceRange();
os << "Call to ";
if (isPure)
os << "pure ";
os << "virtual function during ";
if (isa<CXXConstructorDecl>(RootMethod))
os << "construction ";
else
os << "destruction ";
if (isPure)
os << "has undefined behavior";
else
os << "will not dispatch to derived class";
BR.EmitBasicReport(AC->getDecl(), Checker,
"Call to virtual function during construction or "
"destruction",
"C++ Object Lifecycle", os.str(), CELoc, R);
}
//===----------------------------------------------------------------------===//
// VirtualCallChecker
//===----------------------------------------------------------------------===//
namespace {
class VirtualCallChecker : public Checker<check::ASTDecl<CXXRecordDecl> > {
public:
DefaultBool isInterprocedural;
DefaultBool isPureOnly;
void checkASTDecl(const CXXRecordDecl *RD, AnalysisManager& mgr,
BugReporter &BR) const {
AnalysisDeclContext *ADC = mgr.getAnalysisDeclContext(RD);
// Check the constructors.
for (const auto *I : RD->ctors()) {
if (!I->isCopyOrMoveConstructor())
if (Stmt *Body = I->getBody()) {
WalkAST walker(this, BR, ADC, I, isInterprocedural, isPureOnly);
walker.Visit(Body);
walker.Execute();
}
}
// Check the destructor.
if (CXXDestructorDecl *DD = RD->getDestructor())
if (Stmt *Body = DD->getBody()) {
WalkAST walker(this, BR, ADC, DD, isInterprocedural, isPureOnly);
walker.Visit(Body);
walker.Execute();
}
}
};
}
void ento::registerVirtualCallChecker(CheckerManager &mgr) {
VirtualCallChecker *checker = mgr.registerChecker<VirtualCallChecker>();
checker->isInterprocedural =
mgr.getAnalyzerOptions().getBooleanOption("Interprocedural", false,
checker);
checker->isPureOnly =
mgr.getAnalyzerOptions().getBooleanOption("PureOnly", false,
checker);
}
Index: projects/clang400-import/contrib/llvm/tools/clang
===================================================================
--- projects/clang400-import/contrib/llvm/tools/clang (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/clang (revision 314269)
Property changes on: projects/clang400-import/contrib/llvm/tools/clang
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/clang/dist:r314176-314267
Index: projects/clang400-import/contrib/llvm/tools/lld
===================================================================
--- projects/clang400-import/contrib/llvm/tools/lld (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/lld (revision 314269)
Property changes on: projects/clang400-import/contrib/llvm/tools/lld
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/lld/dist:r314176-314268
Index: projects/clang400-import/contrib/llvm/tools/lldb
===================================================================
--- projects/clang400-import/contrib/llvm/tools/lldb (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/lldb (revision 314269)
Property changes on: projects/clang400-import/contrib/llvm/tools/lldb
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/lldb/dist:r314176-314268
Index: projects/clang400-import/contrib/llvm
===================================================================
--- projects/clang400-import/contrib/llvm (revision 314268)
+++ projects/clang400-import/contrib/llvm (revision 314269)
Property changes on: projects/clang400-import/contrib/llvm
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/llvm/dist:r314176-314267
Index: projects/clang400-import/lib/clang/include/clang/Basic/Version.inc
===================================================================
--- projects/clang400-import/lib/clang/include/clang/Basic/Version.inc (revision 314268)
+++ projects/clang400-import/lib/clang/include/clang/Basic/Version.inc (revision 314269)
@@ -1,11 +1,11 @@
/* $FreeBSD$ */
#define CLANG_VERSION 4.0.0
#define CLANG_VERSION_STRING "4.0.0"
#define CLANG_VERSION_MAJOR 4
#define CLANG_VERSION_MINOR 0
#define CLANG_VERSION_PATCHLEVEL 0
#define CLANG_VENDOR "FreeBSD "
-#define SVN_REVISION "296002"
+#define SVN_REVISION "296202"
Index: projects/clang400-import/lib/clang/include/lld/Config/Version.inc
===================================================================
--- projects/clang400-import/lib/clang/include/lld/Config/Version.inc (revision 314268)
+++ projects/clang400-import/lib/clang/include/lld/Config/Version.inc (revision 314269)
@@ -1,8 +1,8 @@
// $FreeBSD$
#define LLD_VERSION 4.0.0
#define LLD_VERSION_STRING "4.0.0"
#define LLD_VERSION_MAJOR 4
#define LLD_VERSION_MINOR 0
-#define LLD_REVISION_STRING "296002"
+#define LLD_REVISION_STRING "296202"
#define LLD_REPOSITORY_STRING "FreeBSD"

File Metadata

Mime Type
text/x-c++
Expires
Fri, Nov 7, 7:01 AM (2 d)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
24918614
Default Alt Text
(706 KB)

Event Timeline