Index: projects/clang400-import/contrib/compiler-rt
===================================================================
--- projects/clang400-import/contrib/compiler-rt (revision 314268)
+++ projects/clang400-import/contrib/compiler-rt (revision 314269)
Property changes on: projects/clang400-import/contrib/compiler-rt
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/compiler-rt/dist:r314177-314268
Index: projects/clang400-import/contrib/libc++
===================================================================
--- projects/clang400-import/contrib/libc++ (revision 314268)
+++ projects/clang400-import/contrib/libc++ (revision 314269)
Property changes on: projects/clang400-import/contrib/libc++
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/libc++/dist:r314177-314268
Index: projects/clang400-import/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- projects/clang400-import/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h (revision 314268)
+++ projects/clang400-import/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h (revision 314269)
@@ -1,118 +1,112 @@
//===---- SLPVectorizer.h ---------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
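//
// An illustrative sketch (not taken from the upstream header; the array names
// are made up): given straight-line code such as
//
//   a[0] = b[0] + c[0];
//   a[1] = b[1] + c[1];
//   a[2] = b[2] + c[2];
//   a[3] = b[3] + c[3];
//
// the pass recognizes the four consecutive stores, builds a tree from their
// use-def chains, and, if the cost model finds the tree profitable, emits a
// single <4 x i32> add and one vector store in place of the scalar code.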
#ifndef LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H
#define LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H
#include "llvm/ADT/MapVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
namespace llvm {
/// A private "module" namespace for types and utilities used by this pass.
/// These are implementation details and should not be used by clients.
namespace slpvectorizer {
class BoUpSLP;
}
struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
typedef SmallVector<StoreInst *, 8> StoreList;
typedef MapVector<Value *, StoreList> StoreListMap;
typedef SmallVector<WeakVH, 8> WeakVHList;
typedef MapVector<Value *, WeakVHList> WeakVHListMap;
ScalarEvolution *SE = nullptr;
TargetTransformInfo *TTI = nullptr;
TargetLibraryInfo *TLI = nullptr;
AliasAnalysis *AA = nullptr;
LoopInfo *LI = nullptr;
DominatorTree *DT = nullptr;
AssumptionCache *AC = nullptr;
DemandedBits *DB = nullptr;
const DataLayout *DL = nullptr;
public:
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
// Glue for old PM.
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_,
DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_);
private:
/// \brief Collect store and getelementptr instructions and organize them
/// according to the underlying object of their pointer operands. We sort the
/// instructions by their underlying objects to reduce the cost of
/// consecutive access queries.
///
/// TODO: We can further reduce this cost if we flush the chain creation
/// every time we run into a memory barrier.
void collectSeedInstructions(BasicBlock *BB);
/// \brief Try to vectorize a chain that starts at two arithmetic instrs.
bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
/// \brief Try to vectorize a list of operands.
/// \param BuildVector A list of users to ignore for the purpose of
/// scheduling and that don't need extracting.
/// \returns true if a value was vectorized.
bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
ArrayRef<Value *> BuildVector = None,
bool AllowReorder = false);
/// \brief Try to vectorize a chain that may start at the operands of \p V.
bool tryToVectorize(BinaryOperator *V, slpvectorizer::BoUpSLP &R);
/// \brief Vectorize the store instructions collected in Stores.
bool vectorizeStoreChains(slpvectorizer::BoUpSLP &R);
/// \brief Vectorize the index computations of the getelementptr instructions
/// collected in GEPs.
bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
- /// Try to find horizontal reduction or otherwise vectorize a chain of binary
- /// operators.
- bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
- slpvectorizer::BoUpSLP &R,
- TargetTransformInfo *TTI);
-
/// \brief Scan the basic block and look for patterns that are likely to start
/// a vectorization chain.
bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
unsigned VecRegSize);
bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);
/// The store instructions in a basic block organized by base pointer.
StoreListMap Stores;
/// The getelementptr instructions in a basic block organized by base pointer.
WeakVHListMap GEPs;
};
}
#endif // LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H
Index: projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td (revision 314268)
+++ projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td (revision 314269)
@@ -1,1094 +1,1099 @@
//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This file was originally auto-generated from a GPU register header file and
// all the instruction definitions were originally commented out. Instructions
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//
def isGCN : Predicate<"Subtarget->getGeneration() "
">= SISubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureGCN">;
def isSI : Predicate<"Subtarget->getGeneration() "
"== SISubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureSouthernIslands">;
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
AssemblerPredicate<"FeatureVGPRIndexMode">;
def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
AssemblerPredicate<"FeatureMovrel">;
include "VOPInstructions.td"
include "SOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
let SubtargetPredicate = isGCN in {
//===----------------------------------------------------------------------===//
// EXP Instructions
//===----------------------------------------------------------------------===//
defm EXP : EXP_m<0, AMDGPUexport>;
defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
//===----------------------------------------------------------------------===//
// VINTRP Instructions
//===----------------------------------------------------------------------===//
let Uses = [M0, EXEC] in {
// FIXME: Specify SchedRW for VINTRP instructions.
multiclass V_INTERP_P1_F32_m : VINTRP_m <
0x00000000,
(outs VGPR_32:$vdst),
(ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan),
(i32 imm:$attr)))]
>;
let OtherPredicates = [has32BankLDS] in {
defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;
} // End OtherPredicates = [has32BankLDS]
let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {
defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;
} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1
let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
defm V_INTERP_P2_F32 : VINTRP_m <
0x00000001,
(outs VGPR_32:$vdst),
(ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
(i32 imm:$attr)))]>;
} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
defm V_INTERP_MOV_F32 : VINTRP_m <
0x00000002,
(outs VGPR_32:$vdst),
(ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
(i32 imm:$attr)))]>;
} // End Uses = [M0, EXEC]
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// For use in patterns
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let usesCustomInserter = 1;
}
// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
// pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End let usesCustomInserter = 1, SALU = 1
def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),
(ins SSrc_b64:$src0)> {
let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
}
def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
(ins SSrc_b64:$src0, SSrc_b64:$src1)> {
let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
}
def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
(ins SSrc_b64:$src0, SSrc_b64:$src1)> {
let SALU = 1;
let isAsCheapAsAMove = 1;
let isTerminator = 1;
}
def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
[(int_amdgcn_wave_barrier)]> {
let SchedRW = [];
let hasNoSchedulingInfo = 1;
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
let isBarrier = 1;
let isConvergent = 1;
}
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
// Dummy terminator instruction to use after control flow instructions
// replaced with exec mask operations.
def SI_MASK_BRANCH : PseudoInstSI <
(outs), (ins brtarget:$target)> {
let isBranch = 0;
let isTerminator = 1;
let isBarrier = 0;
let Uses = [EXEC];
let SchedRW = [];
let hasNoSchedulingInfo = 1;
}
let isTerminator = 1 in {
def SI_IF: CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
[(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
let Constraints = "";
let Size = 12;
let mayLoad = 1;
let mayStore = 1;
let hasSideEffects = 1;
}
def SI_ELSE : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
let Constraints = "$src = $dst";
let Size = 12;
let mayStore = 1;
let mayLoad = 1;
let hasSideEffects = 1;
}
def SI_LOOP : CFPseudoInstSI <
(outs), (ins SReg_64:$saved, brtarget:$target),
[(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
let Size = 8;
let isBranch = 1;
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
}
} // End isTerminator = 1
def SI_END_CF : CFPseudoInstSI <
(outs), (ins SReg_64:$saved),
[(int_amdgcn_end_cf i64:$saved)], 1, 1> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
let mayLoad = 1;
let mayStore = 1;
let hasSideEffects = 1;
}
def SI_BREAK : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$src),
[(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
}
def SI_IF_BREAK : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
[(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
}
def SI_ELSE_BREAK : CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
[(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
}
let Uses = [EXEC], Defs = [EXEC,VCC] in {
def SI_KILL : PseudoInstSI <
(outs), (ins VSrc_b32:$src),
[(AMDGPUkill i32:$src)]> {
let isConvergent = 1;
let usesCustomInserter = 1;
}
def SI_KILL_TERMINATOR : SPseudoInstSI <
(outs), (ins VSrc_b32:$src)> {
let isTerminator = 1;
}
} // End Uses = [EXEC], Defs = [EXEC,VCC]
// Branch on undef scc. Used to avoid intermediate copy from
// IMPLICIT_DEF to SCC.
def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
let isTerminator = 1;
let usesCustomInserter = 1;
}
def SI_PS_LIVE : PseudoInstSI <
(outs SReg_64:$dst), (ins),
[(set i1:$dst, (int_amdgcn_ps_live))]> {
let SALU = 1;
}
// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
// fold operands before it runs.
def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
let Defs = [M0];
let usesCustomInserter = 1;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
}
def SI_RETURN : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn)]> {
let isTerminator = 1;
let isBarrier = 1;
let isReturn = 1;
let hasSideEffects = 1;
let hasNoSchedulingInfo = 1;
let DisableWQM = 1;
}
let Defs = [M0, EXEC],
UseNamedOperandTable = 1 in {
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
(outs VGPR_32:$vdst),
(ins rc:$src, VS_32:$idx, i32imm:$offset)> {
let usesCustomInserter = 1;
}
class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
(outs rc:$vdst),
(ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
let Constraints = "$src = $vdst";
let usesCustomInserter = 1;
}
// TODO: We can support indirect SGPR access.
def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
} // End Defs = [M0, EXEC], UseNamedOperandTable = 1
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
def _SAVE : PseudoInstSI <
(outs),
(ins sgpr_class:$data, i32imm:$addr)> {
let mayStore = 1;
let mayLoad = 0;
}
def _RESTORE : PseudoInstSI <
(outs sgpr_class:$data),
(ins i32imm:$addr)> {
let mayStore = 0;
let mayLoad = 1;
}
} // End UseNamedOperandTable = 1
}
// You cannot use M0 as the output of v_readlane_b32 instructions or
// use it in the sdata operand of SMEM instructions. We still need to
// be able to spill the physical register m0, so allow it for
// SI_SPILL_32_* instructions.
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
let UseNamedOperandTable = 1, VGPRSpill = 1,
SchedRW = [WriteVMEM] in {
def _SAVE : VPseudoInstSI <
(outs),
(ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
SReg_32:$soffset, i32imm:$offset)> {
let mayStore = 1;
let mayLoad = 0;
// (2 * 4) + (8 * num_subregs) bytes maximum
let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
}
def _RESTORE : VPseudoInstSI <
(outs vgpr_class:$vdata),
(ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
i32imm:$offset)> {
let mayStore = 0;
let mayLoad = 1;
// (2 * 4) + (8 * num_subregs) bytes maximum
let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
}
} // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}
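// A worked example of the Size expression above (assuming vgpr_class.Size is
// expressed in bits): for VReg_128, !srl(128, 5) = 4 sub-registers,
// !shl(4, 3) = 32, and !add(32, 8) = 40 bytes, matching
// (2 * 4) + (8 * num_subregs).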
defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
(ins si_ga:$ptr_lo, si_ga:$ptr_hi),
[(set SReg_64:$dst,
(i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> {
let Defs = [SCC];
}
} // End SubtargetPredicate = isGCN
let Predicates = [isGCN] in {
def : Pat<
(int_amdgcn_else i64:$src, bb:$target),
(SI_ELSE $src, $target, 0)
>;
def : Pat <
(int_AMDGPU_kilp),
(SI_KILL (i32 0xbf800000))
>;
//===----------------------------------------------------------------------===//
// VOP1 Patterns
//===----------------------------------------------------------------------===//
let Predicates = [UnsafeFPMath] in {
//def : RcpPat<V_RCP_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F64_e32, f64>;
//defm : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F32_e32, f32>;
def : RsqPat<V_RSQ_F64_e32, f64>;
// Convert (x - floor(x)) to fract(x)
def : Pat <
(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
(f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
(V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
// Convert (x + (-floor(x))) to fract(x)
def : Pat <
(f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
(f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
} // End Predicates = [UnsafeFPMath]
def : Pat <
(f32 (fpextend f16:$src)),
(V_CVT_F32_F16_e32 $src)
>;
def : Pat <
(f64 (fpextend f16:$src)),
(V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
def : Pat <
(f16 (fpround f32:$src)),
(V_CVT_F16_F32_e32 $src)
>;
def : Pat <
(f16 (fpround f64:$src)),
(V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src))
>;
def : Pat <
(i32 (fp_to_sint f16:$src)),
(V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
def : Pat <
(i32 (fp_to_uint f16:$src)),
(V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src))
>;
def : Pat <
(f16 (sint_to_fp i32:$src)),
(V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src))
>;
def : Pat <
(f16 (uint_to_fp i32:$src)),
(V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src))
>;
//===----------------------------------------------------------------------===//
// VOP2 Patterns
//===----------------------------------------------------------------------===//
multiclass FMADPat <ValueType vt, Instruction inst> {
def : Pat <
(vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
(VOP3NoMods vt:$src1, i32:$src1_modifiers),
(VOP3NoMods vt:$src2, i32:$src2_modifiers))),
(inst $src0_modifiers, $src0, $src1_modifiers, $src1,
$src2_modifiers, $src2, $clamp, $omod)
>;
}
defm : FMADPat <f16, V_MAC_F16_e64>;
defm : FMADPat <f32, V_MAC_F32_e64>;
multiclass SelectPat <ValueType vt, Instruction inst> {
def : Pat <
(vt (select i1:$src0, vt:$src1, vt:$src2)),
(inst $src2, $src1, $src0)
>;
}
defm : SelectPat <i16, V_CNDMASK_B32_e64>;
defm : SelectPat <i32, V_CNDMASK_B32_e64>;
defm : SelectPat <f16, V_CNDMASK_B32_e64>;
defm : SelectPat <f32, V_CNDMASK_B32_e64>;
def : Pat <
(i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
foreach Index = 0-2 in {
def Extract_Element_v2i32_#Index : Extract_Element <
i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v2i32_#Index : Insert_Element <
i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v2f32_#Index : Extract_Element <
f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v2f32_#Index : Insert_Element <
f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
foreach Index = 0-3 in {
def Extract_Element_v4i32_#Index : Extract_Element <
i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v4i32_#Index : Insert_Element <
i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v4f32_#Index : Extract_Element <
f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v4f32_#Index : Insert_Element <
f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
foreach Index = 0-7 in {
def Extract_Element_v8i32_#Index : Extract_Element <
i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v8i32_#Index : Insert_Element <
i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v8f32_#Index : Extract_Element <
f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v8f32_#Index : Insert_Element <
f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
foreach Index = 0-15 in {
def Extract_Element_v16i32_#Index : Extract_Element <
i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v16i32_#Index : Insert_Element <
i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v16f32_#Index : Extract_Element <
f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v16f32_#Index : Insert_Element <
f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
// FIXME: Why do we only handle some of these type combinations for SReg and
// VReg?
// 16-bit bitcast
def : BitConvert <i16, f16, VGPR_32>;
def : BitConvert <f16, i16, VGPR_32>;
def : BitConvert <i16, f16, SReg_32>;
def : BitConvert <f16, i16, SReg_32>;
// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <f32, i32, SReg_32>;
// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
def : BitConvert <f64, i64, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2f32, VReg_64>;
def : BitConvert <v2f32, i64, VReg_64>;
def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;
// 128-bit bitcast
def : BitConvert <v2i64, v4i32, SReg_128>;
def : BitConvert <v4i32, v2i64, SReg_128>;
def : BitConvert <v2f64, v4f32, VReg_128>;
def : BitConvert <v2f64, v4i32, VReg_128>;
def : BitConvert <v4f32, v2f64, VReg_128>;
def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, VReg_256>;
def : BitConvert <v8f32, v8i32, VReg_256>;
// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
/********** =================== **********/
/********** Src & Dst modifiers **********/
/********** =================== **********/
def : Pat <
(AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
(f32 FP_ZERO), (f32 FP_ONE)),
(V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
>;
/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/
// Prevent expanding both fneg and fabs.
def : Pat <
(fneg (fabs f32:$src)),
(S_OR_B32 $src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit
>;
// FIXME: Should use S_OR_B32
def : Pat <
(fneg (fabs f64:$src)),
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
(V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
(V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
sub1)
>;
def : Pat <
(fabs f32:$src),
(V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
>;
def : Pat <
(fneg f32:$src),
(V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000)))
>;
def : Pat <
(fabs f64:$src),
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
(V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
(V_MOV_B32_e32 (i32 0x7fffffff))), // Clear sign bit.
sub1)
>;
def : Pat <
(fneg f64:$src),
(REG_SEQUENCE VReg_64,
(i32 (EXTRACT_SUBREG f64:$src, sub0)),
sub0,
(V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
(i32 (V_MOV_B32_e32 (i32 0x80000000)))),
sub1)
>;
def : Pat <
(fneg f16:$src),
(V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
>;
def : Pat <
(fabs f16:$src),
(V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff)))
>;
def : Pat <
(fneg (fabs f16:$src)),
(S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
>;
/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/
def : Pat <
(VGPRImm<(i32 imm)>:$imm),
(V_MOV_B32_e32 imm:$imm)
>;
def : Pat <
(VGPRImm<(f32 fpimm)>:$imm),
(V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
(i32 imm:$imm),
(S_MOV_B32 imm:$imm)
>;
// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding. Should
// use s_mov_b32, which can be shrunk to s_movk_i32
def : Pat <
(VGPRImm<(f16 fpimm)>:$imm),
(V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
(f32 fpimm:$imm),
(S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
(f16 fpimm:$imm),
(S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;
def : Pat <
(i32 frameindex:$fi),
(V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
>;
def : Pat <
(i64 InlineImm<i64>:$imm),
(S_MOV_B64 InlineImm<i64>:$imm)
>;
// XXX - Should this use a s_cmp to set SCC?
// Set to sign-extended 64-bit value (true = -1, false = 0)
def : Pat <
(i1 imm:$imm),
(S_MOV_B64 (i64 (as_i64imm $imm)))
>;
def : Pat <
(f64 InlineFPImm<f64>:$imm),
(S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm)))
>;
/********** ================== **********/
/********** Intrinsic Patterns **********/
/********** ================== **********/
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
def : Pat <
(int_AMDGPU_cube v4f32:$src),
(REG_SEQUENCE VReg_128,
(V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub0,
(V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub1,
(V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub2,
(V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
0 /* clamp */, 0 /* omod */), sub3)
>;
def : Pat <
(i32 (sext i1:$src0)),
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
>;
class Ext32Pat <SDNode ext> : Pat <
(i32 (ext i1:$src0)),
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
>;
def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
// The multiplication scales from [0,1] to the unsigned integer range
def : Pat <
(AMDGPUurecip i32:$src0),
(V_CVT_U32_F32_e32
(V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
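// Read as pseudocode, the pattern above computes approximately
//   urecip(x) ~= (u32)(CONST.FP_UINT_MAX_PLUS_1 * (1.0f / (float)x))
// i.e. an f32 reciprocal scaled back up into the unsigned integer range
// (assuming CONST.FP_UINT_MAX_PLUS_1 encodes 2^32 as an f32 immediate).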
//===----------------------------------------------------------------------===//
// VOP3 Patterns
//===----------------------------------------------------------------------===//
def : IMad24Pat<V_MAD_I32_I24>;
def : UMad24Pat<V_MAD_U32_U24>;
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
/********** ====================== **********/
/********** Indirect addressing **********/
/********** ====================== **********/
multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
// Extract with offset
def : Pat<
(eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
(!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
>;
// Insert with offset
def : Pat<
(insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
(!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
>;
}
defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
//===----------------------------------------------------------------------===//
// SAD Patterns
//===----------------------------------------------------------------------===//
def : Pat <
(add (sub_oneuse (umax i32:$src0, i32:$src1),
(umin i32:$src0, i32:$src1)),
i32:$src2),
(V_SAD_U32 $src0, $src1, $src2)
>;
def : Pat <
(add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)),
(sub i32:$src0, i32:$src1),
(sub i32:$src1, i32:$src0)),
i32:$src2),
(V_SAD_U32 $src0, $src1, $src2)
>;
//===----------------------------------------------------------------------===//
// Conversion Patterns
//===----------------------------------------------------------------------===//
def : Pat<(i32 (sext_inreg i32:$src, i1)),
(S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
// Handle sext_inreg in i64
def : Pat <
(i64 (sext_inreg i64:$src, i1)),
(S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
>;
def : Pat <
(i16 (sext_inreg i16:$src, i1)),
(S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
>;
def : Pat <
(i16 (sext_inreg i16:$src, i8)),
(S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i8)),
(S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i16)),
(S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
>;
def : Pat <
(i64 (sext_inreg i64:$src, i32)),
(S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
>;
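// Per the "offset | width << 16" comments above, the S_BFE immediate packs the
// bitfield offset in the low bits and the width in bits 16 and up, so for
// example 0x200000 = 32 << 16 selects a 32-bit field at offset 0.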
def : Pat <
(i64 (zext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
>;
def : Pat <
(i64 (anyext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
>;
class ZExt_i64_i1_Pat <SDNode ext> : Pat <
(i64 (ext i1:$src)),
(REG_SEQUENCE VReg_64,
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
(S_MOV_B32 (i32 0)), sub1)
>;
def : ZExt_i64_i1_Pat<zext>;
def : ZExt_i64_i1_Pat<anyext>;
// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
// REG_SEQUENCE patterns don't support instructions with multiple outputs.
def : Pat <
(i64 (sext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;
def : Pat <
(i64 (sext i1:$src)),
(REG_SEQUENCE VReg_64,
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
>;
class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
(i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
(i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
// comparisons still write to a pair of SGPRs, so treat these as
// 64-bit comparisons. When legalizing SGPR copies, instructions that
// result in copies from SCC to these instructions will be moved to the
// VALU.
def : Pat <
(i1 (and i1:$src0, i1:$src1)),
(S_AND_B64 $src0, $src1)
>;
def : Pat <
(i1 (or i1:$src0, i1:$src1)),
(S_OR_B64 $src0, $src1)
>;
def : Pat <
(i1 (xor i1:$src0, i1:$src1)),
(S_XOR_B64 $src0, $src1)
>;
def : Pat <
(f32 (sint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
>;
def : Pat <
(f32 (uint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
>;
def : Pat <
(f64 (sint_to_fp i1:$src)),
(V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
>;
def : Pat <
(f64 (uint_to_fp i1:$src)),
(V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
>;
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
def : Pat <
(i32 (trunc i64:$a)),
(EXTRACT_SUBREG $a, sub0)
>;
def : Pat <
(i1 (trunc i32:$a)),
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
>;
def : Pat <
+ (i1 (trunc i16:$a)),
+ (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
+>;
+
+def : Pat <
(i1 (trunc i64:$a)),
(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;
def : Pat <
(i32 (bswap i32:$a)),
(V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
(V_ALIGNBIT_B32 $a, $a, (i32 24)),
(V_ALIGNBIT_B32 $a, $a, (i32 8)))
>;
multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
def : Pat <
(vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
(BFM $a, $b)
>;
def : Pat <
(vt (add (vt (shl 1, vt:$a)), -1)),
(BFM $a, (MOV (i32 0)))
>;
}
defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
def : BFEPattern <V_BFE_U32, S_MOV_B32>;
def : Pat<
(fcanonicalize f16:$src),
(V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0)
>;
def : Pat<
(fcanonicalize f32:$src),
(V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
>;
def : Pat<
(fcanonicalize f64:$src),
(V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0)
>;
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
let Predicates = [isSI] in {
// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
// way to implement it is using V_FRACT_F64.
// The workaround for the V_FRACT bug is:
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
// Convert floor(x) to (x - fract(x))
def : Pat <
(f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
(V_ADD_F64
$mods,
$x,
SRCMODS.NEG,
(V_CNDMASK_B64_PSEUDO
(V_MIN_F64
SRCMODS.NONE,
(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
SRCMODS.NONE,
(V_MOV_B64_PSEUDO 0x3fefffffffffffff),
DSTCLAMP.NONE, DSTOMOD.NONE),
$x,
(V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
DSTCLAMP.NONE, DSTOMOD.NONE)
>;
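// Mapping the pattern above onto the workaround described earlier:
// V_MOV_B64_PSEUDO 0x3fefffffffffffff materializes the largest f64 below 1.0,
// V_MIN_F64 clamps the V_FRACT_F64 result to it, V_CNDMASK_B64_PSEUDO falls
// back to x itself when V_CMP_CLASS_F64 reports a NaN (class mask 3), and
// V_ADD_F64 with SRCMODS.NEG then forms x - fract(x), i.e. floor(x).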
} // End Predicates = [isSI]
//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
//============================================================================//
// Assembler aliases
//============================================================================//
def : MnemonicAlias<"v_add_u32", "v_add_i32">;
def : MnemonicAlias<"v_sub_u32", "v_sub_i32">;
def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">;
} // End isGCN predicate
Index: projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td (revision 314268)
+++ projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td (revision 314269)
@@ -1,621 +1,615 @@
//===-- VOP1Instructions.td - Vector Instruction Definitions --------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// VOP1 Classes
//===----------------------------------------------------------------------===//
class VOP1e <bits<8> op, VOPProfile P> : Enc32 {
bits<8> vdst;
bits<9> src0;
let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, 0);
let Inst{16-9} = op;
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f; //encoding
}
class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
bits<8> vdst;
let Inst{8-0} = 0xf9; // sdwa
let Inst{16-9} = op;
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f; // encoding
}
class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
InstSI <P.Outs32, P.Ins32, "", pattern>,
VOP <opName>,
SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>,
MnemonicAlias<opName#"_e32", opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let UseNamedOperandTable = 1;
string Mnemonic = opName;
string AsmOperands = P.Asm32;
let Size = 4;
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SubtargetPredicate = isGCN;
let VOP1 = 1;
let VALU = 1;
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.Default;
VOPProfile Pfl = P;
}
class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
let isPseudo = 0;
let isCodeGenOnly = 0;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
}
class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
VOP_SDWA_Pseudo <OpName, P, pattern> {
let AsmMatchConverter = "cvtSdwaVOP1";
}
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
[(set P.DstVT:$vdst, (node P.Src0VT:$src0))]);
}
multiclass VOP1Inst <string opName, VOPProfile P,
SDPatternOperator node = null_frag> {
def _e32 : VOP1_Pseudo <opName, P>;
def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
}
//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
let VOPAsmPrefer32Bit = 1 in {
defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>;
}
let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
} // End isMoveImm = 1
// FIXME: Specify SchedRW for READFIRSTLANE_B32
// TODO: Make profile for this, there is VOP3 encoding also
def V_READFIRSTLANE_B32 :
InstSI <(outs SReg_32:$vdst),
(ins VGPR_32:$src0),
"v_readfirstlane_b32 $vdst, $src0",
[(set i32:$vdst, (int_amdgcn_readfirstlane i32:$src0))]>,
Enc32 {
let isCodeGenOnly = 0;
let UseNamedOperandTable = 1;
let Size = 4;
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SubtargetPredicate = isGCN;
let VOP1 = 1;
let VALU = 1;
let Uses = [EXEC];
let isConvergent = 1;
bits<8> vdst;
bits<9> src0;
let Inst{8-0} = src0;
let Inst{16-9} = 0x2;
let Inst{24-17} = vdst;
let Inst{31-25} = 0x3f; //encoding
}
let SchedRW = [WriteQuarterRate32] in {
defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>;
defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>;
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>;
defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>;
defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>;
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>;
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>;
defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>;
defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>;
defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>;
defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>;
} // End SchedRW = [WriteQuarterRate32]
defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
let SchedRW = [WriteQuarterRate32] in {
defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>;
defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
} // End SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteDouble] in {
defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
} // End SchedRW = [WriteDouble];
defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
let SchedRW = [WriteDouble] in {
defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
} // End SchedRW = [WriteDouble]
let SchedRW = [WriteQuarterRate32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
} // End SchedRW = [WriteQuarterRate32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
let SchedRW = [WriteDoubleAdd] in {
defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
} // End SchedRW = [WriteDoubleAdd]
defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>;
let VOPAsmPrefer32Bit = 1 in {
defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
}
// Restrict src0 to be VGPR
def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
let Src0RC32 = VRegSrc_32;
let Src0RC64 = VRegSrc_32;
let HasExt = 0;
}
// Special case because there are no true output operands. Hack vdst
// to be a src operand. The custom inserter must add a tied implicit
// def and use of the super register since there seems to be no way to
// add an implicit def of a virtual register in tablegen.
def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let Src0RC32 = VOPDstOperand<VGPR_32>;
let Src0RC64 = VOPDstOperand<VGPR_32>;
let Outs = (outs);
let Ins32 = (ins Src0RC32:$vdst, VSrc_b32:$src0);
let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0,
clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel);
let Asm32 = getAsm32<1, 1>.ret;
let Asm64 = getAsm64<1, 1, 0>.ret;
let AsmDPP = getAsmDPP<1, 1, 0>.ret;
let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
let HasExt = 0;
let HasDst = 0;
let EmitDst = 1; // force vdst emission
}
let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in {
// v_movreld_b32 is a special case because the destination output
// register is really a source. It isn't actually read (but may be
// written), and exists only to provide the base register to start
// indexing from. Tablegen does not seem to let you define an implicit
// virtual register output for the super register being written into,
// so this must have an implicit def of the register added to it.
defm V_MOVRELD_B32 : VOP1Inst <"v_movreld_b32", VOP_MOVRELD>;
defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
} // End Uses = [M0, EXEC]
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
let SchedRW = [WriteQuarterRate32] in {
defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>;
defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
defm V_RSQ_CLAMP_F32 : VOP1Inst <"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
defm V_RSQ_LEGACY_F32 : VOP1Inst <"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
} // End SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteDouble] in {
defm V_RCP_CLAMP_F64 : VOP1Inst <"v_rcp_clamp_f64", VOP_F64_F64>;
defm V_RSQ_CLAMP_F64 : VOP1Inst <"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
} // End SchedRW = [WriteDouble]
} // End SubtargetPredicate = isSICI
let SubtargetPredicate = isCIVI in {
let SchedRW = [WriteDoubleAdd] in {
defm V_TRUNC_F64 : VOP1Inst <"v_trunc_f64", VOP_F64_F64, ftrunc>;
defm V_CEIL_F64 : VOP1Inst <"v_ceil_f64", VOP_F64_F64, fceil>;
defm V_FLOOR_F64 : VOP1Inst <"v_floor_f64", VOP_F64_F64, ffloor>;
defm V_RNDNE_F64 : VOP1Inst <"v_rndne_f64", VOP_F64_F64, frint>;
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteQuarterRate32] in {
defm V_LOG_LEGACY_F32 : VOP1Inst <"v_log_legacy_f32", VOP_F32_F32>;
defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
} // End SchedRW = [WriteQuarterRate32]
} // End SubtargetPredicate = isCIVI
let SubtargetPredicate = isVI in {
defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>;
defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>;
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
}
let Predicates = [isVI] in {
def : Pat<
(f32 (f16_to_fp i16:$src)),
(V_CVT_F32_F16_e32 $src)
>;
def : Pat<
(i16 (fp_to_f16 f32:$src)),
(V_CVT_F16_F32_e32 $src)
>;
}
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// SI
//===----------------------------------------------------------------------===//
multiclass VOP1_Real_si <bits<9> op> {
let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
def _e32_si :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
def _e64_si :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
}
defm V_NOP : VOP1_Real_si <0x0>;
defm V_MOV_B32 : VOP1_Real_si <0x1>;
defm V_CVT_I32_F64 : VOP1_Real_si <0x3>;
defm V_CVT_F64_I32 : VOP1_Real_si <0x4>;
defm V_CVT_F32_I32 : VOP1_Real_si <0x5>;
defm V_CVT_F32_U32 : VOP1_Real_si <0x6>;
defm V_CVT_U32_F32 : VOP1_Real_si <0x7>;
defm V_CVT_I32_F32 : VOP1_Real_si <0x8>;
defm V_MOV_FED_B32 : VOP1_Real_si <0x9>;
defm V_CVT_F16_F32 : VOP1_Real_si <0xa>;
defm V_CVT_F32_F16 : VOP1_Real_si <0xb>;
defm V_CVT_RPI_I32_F32 : VOP1_Real_si <0xc>;
defm V_CVT_FLR_I32_F32 : VOP1_Real_si <0xd>;
defm V_CVT_OFF_F32_I4 : VOP1_Real_si <0xe>;
defm V_CVT_F32_F64 : VOP1_Real_si <0xf>;
defm V_CVT_F64_F32 : VOP1_Real_si <0x10>;
defm V_CVT_F32_UBYTE0 : VOP1_Real_si <0x11>;
defm V_CVT_F32_UBYTE1 : VOP1_Real_si <0x12>;
defm V_CVT_F32_UBYTE2 : VOP1_Real_si <0x13>;
defm V_CVT_F32_UBYTE3 : VOP1_Real_si <0x14>;
defm V_CVT_U32_F64 : VOP1_Real_si <0x15>;
defm V_CVT_F64_U32 : VOP1_Real_si <0x16>;
defm V_FRACT_F32 : VOP1_Real_si <0x20>;
defm V_TRUNC_F32 : VOP1_Real_si <0x21>;
defm V_CEIL_F32 : VOP1_Real_si <0x22>;
defm V_RNDNE_F32 : VOP1_Real_si <0x23>;
defm V_FLOOR_F32 : VOP1_Real_si <0x24>;
defm V_EXP_F32 : VOP1_Real_si <0x25>;
defm V_LOG_CLAMP_F32 : VOP1_Real_si <0x26>;
defm V_LOG_F32 : VOP1_Real_si <0x27>;
defm V_RCP_CLAMP_F32 : VOP1_Real_si <0x28>;
defm V_RCP_LEGACY_F32 : VOP1_Real_si <0x29>;
defm V_RCP_F32 : VOP1_Real_si <0x2a>;
defm V_RCP_IFLAG_F32 : VOP1_Real_si <0x2b>;
defm V_RSQ_CLAMP_F32 : VOP1_Real_si <0x2c>;
defm V_RSQ_LEGACY_F32 : VOP1_Real_si <0x2d>;
defm V_RSQ_F32 : VOP1_Real_si <0x2e>;
defm V_RCP_F64 : VOP1_Real_si <0x2f>;
defm V_RCP_CLAMP_F64 : VOP1_Real_si <0x30>;
defm V_RSQ_F64 : VOP1_Real_si <0x31>;
defm V_RSQ_CLAMP_F64 : VOP1_Real_si <0x32>;
defm V_SQRT_F32 : VOP1_Real_si <0x33>;
defm V_SQRT_F64 : VOP1_Real_si <0x34>;
defm V_SIN_F32 : VOP1_Real_si <0x35>;
defm V_COS_F32 : VOP1_Real_si <0x36>;
defm V_NOT_B32 : VOP1_Real_si <0x37>;
defm V_BFREV_B32 : VOP1_Real_si <0x38>;
defm V_FFBH_U32 : VOP1_Real_si <0x39>;
defm V_FFBL_B32 : VOP1_Real_si <0x3a>;
defm V_FFBH_I32 : VOP1_Real_si <0x3b>;
defm V_FREXP_EXP_I32_F64 : VOP1_Real_si <0x3c>;
defm V_FREXP_MANT_F64 : VOP1_Real_si <0x3d>;
defm V_FRACT_F64 : VOP1_Real_si <0x3e>;
defm V_FREXP_EXP_I32_F32 : VOP1_Real_si <0x3f>;
defm V_FREXP_MANT_F32 : VOP1_Real_si <0x40>;
defm V_CLREXCP : VOP1_Real_si <0x41>;
defm V_MOVRELD_B32 : VOP1_Real_si <0x42>;
defm V_MOVRELS_B32 : VOP1_Real_si <0x43>;
defm V_MOVRELSD_B32 : VOP1_Real_si <0x44>;
//===----------------------------------------------------------------------===//
// CI
//===----------------------------------------------------------------------===//
multiclass VOP1_Real_ci <bits<9> op> {
let AssemblerPredicates = [isCIOnly], DecoderNamespace = "CI" in {
def _e32_ci :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
def _e64_ci :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
}
defm V_TRUNC_F64 : VOP1_Real_ci <0x17>;
defm V_CEIL_F64 : VOP1_Real_ci <0x18>;
defm V_FLOOR_F64 : VOP1_Real_ci <0x1A>;
defm V_RNDNE_F64 : VOP1_Real_ci <0x19>;
defm V_LOG_LEGACY_F32 : VOP1_Real_ci <0x45>;
defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>;
//===----------------------------------------------------------------------===//
// VI
//===----------------------------------------------------------------------===//
class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
VOP_DPP <ps.OpName, P> {
let Defs = ps.Defs;
let Uses = ps.Uses;
let SchedRW = ps.SchedRW;
let hasSideEffects = ps.hasSideEffects;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
bits<8> vdst;
let Inst{8-0} = 0xfa; // dpp
let Inst{16-9} = op;
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f; //encoding
}
multiclass VOP1_Real_vi <bits<10> op> {
let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
def _e32_vi :
VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
// For now, DPP is only used for asm/disasm
// TODO: add corresponding pseudo
def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
}
defm V_NOP : VOP1_Real_vi <0x0>;
defm V_MOV_B32 : VOP1_Real_vi <0x1>;
defm V_CVT_I32_F64 : VOP1_Real_vi <0x3>;
defm V_CVT_F64_I32 : VOP1_Real_vi <0x4>;
defm V_CVT_F32_I32 : VOP1_Real_vi <0x5>;
defm V_CVT_F32_U32 : VOP1_Real_vi <0x6>;
defm V_CVT_U32_F32 : VOP1_Real_vi <0x7>;
defm V_CVT_I32_F32 : VOP1_Real_vi <0x8>;
defm V_CVT_F16_F32 : VOP1_Real_vi <0xa>;
defm V_CVT_F32_F16 : VOP1_Real_vi <0xb>;
defm V_CVT_RPI_I32_F32 : VOP1_Real_vi <0xc>;
defm V_CVT_FLR_I32_F32 : VOP1_Real_vi <0xd>;
defm V_CVT_OFF_F32_I4 : VOP1_Real_vi <0xe>;
defm V_CVT_F32_F64 : VOP1_Real_vi <0xf>;
defm V_CVT_F64_F32 : VOP1_Real_vi <0x10>;
defm V_CVT_F32_UBYTE0 : VOP1_Real_vi <0x11>;
defm V_CVT_F32_UBYTE1 : VOP1_Real_vi <0x12>;
defm V_CVT_F32_UBYTE2 : VOP1_Real_vi <0x13>;
defm V_CVT_F32_UBYTE3 : VOP1_Real_vi <0x14>;
defm V_CVT_U32_F64 : VOP1_Real_vi <0x15>;
defm V_CVT_F64_U32 : VOP1_Real_vi <0x16>;
defm V_FRACT_F32 : VOP1_Real_vi <0x1b>;
defm V_TRUNC_F32 : VOP1_Real_vi <0x1c>;
defm V_CEIL_F32 : VOP1_Real_vi <0x1d>;
defm V_RNDNE_F32 : VOP1_Real_vi <0x1e>;
defm V_FLOOR_F32 : VOP1_Real_vi <0x1f>;
defm V_EXP_F32 : VOP1_Real_vi <0x20>;
defm V_LOG_F32 : VOP1_Real_vi <0x21>;
defm V_RCP_F32 : VOP1_Real_vi <0x22>;
defm V_RCP_IFLAG_F32 : VOP1_Real_vi <0x23>;
defm V_RSQ_F32 : VOP1_Real_vi <0x24>;
defm V_RCP_F64 : VOP1_Real_vi <0x25>;
defm V_RSQ_F64 : VOP1_Real_vi <0x26>;
defm V_SQRT_F32 : VOP1_Real_vi <0x27>;
defm V_SQRT_F64 : VOP1_Real_vi <0x28>;
defm V_SIN_F32 : VOP1_Real_vi <0x29>;
defm V_COS_F32 : VOP1_Real_vi <0x2a>;
defm V_NOT_B32 : VOP1_Real_vi <0x2b>;
defm V_BFREV_B32 : VOP1_Real_vi <0x2c>;
defm V_FFBH_U32 : VOP1_Real_vi <0x2d>;
defm V_FFBL_B32 : VOP1_Real_vi <0x2e>;
defm V_FFBH_I32 : VOP1_Real_vi <0x2f>;
defm V_FREXP_EXP_I32_F64 : VOP1_Real_vi <0x30>;
defm V_FREXP_MANT_F64 : VOP1_Real_vi <0x31>;
defm V_FRACT_F64 : VOP1_Real_vi <0x32>;
defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>;
defm V_FREXP_MANT_F32 : VOP1_Real_vi <0x34>;
defm V_CLREXCP : VOP1_Real_vi <0x35>;
defm V_MOVRELD_B32 : VOP1_Real_vi <0x36>;
defm V_MOVRELS_B32 : VOP1_Real_vi <0x37>;
defm V_MOVRELSD_B32 : VOP1_Real_vi <0x38>;
defm V_TRUNC_F64 : VOP1_Real_vi <0x17>;
defm V_CEIL_F64 : VOP1_Real_vi <0x18>;
defm V_FLOOR_F64 : VOP1_Real_vi <0x1A>;
defm V_RNDNE_F64 : VOP1_Real_vi <0x19>;
defm V_LOG_LEGACY_F32 : VOP1_Real_vi <0x4c>;
defm V_EXP_LEGACY_F32 : VOP1_Real_vi <0x4b>;
defm V_CVT_F16_U16 : VOP1_Real_vi <0x39>;
defm V_CVT_F16_I16 : VOP1_Real_vi <0x3a>;
defm V_CVT_U16_F16 : VOP1_Real_vi <0x3b>;
defm V_CVT_I16_F16 : VOP1_Real_vi <0x3c>;
defm V_RCP_F16 : VOP1_Real_vi <0x3d>;
defm V_SQRT_F16 : VOP1_Real_vi <0x3e>;
defm V_RSQ_F16 : VOP1_Real_vi <0x3f>;
defm V_LOG_F16 : VOP1_Real_vi <0x40>;
defm V_EXP_F16 : VOP1_Real_vi <0x41>;
defm V_FREXP_MANT_F16 : VOP1_Real_vi <0x42>;
defm V_FREXP_EXP_I16_F16 : VOP1_Real_vi <0x43>;
defm V_FLOOR_F16 : VOP1_Real_vi <0x44>;
defm V_CEIL_F16 : VOP1_Real_vi <0x45>;
defm V_TRUNC_F16 : VOP1_Real_vi <0x46>;
defm V_RNDNE_F16 : VOP1_Real_vi <0x47>;
defm V_FRACT_F16 : VOP1_Real_vi <0x48>;
defm V_SIN_F16 : VOP1_Real_vi <0x49>;
defm V_COS_F16 : VOP1_Real_vi <0x4a>;
// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
// and an implicit use and def of the super register should be added.
def V_MOV_B32_indirect : VPseudoInstSI<(outs),
(ins getVALUDstForVT<i32>.ret:$vdst, getVOPSrc0ForVT<i32>.ret:$src0)>,
PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst,
getVOPSrc0ForVT<i32>.ret:$src0)> {
let VOP1 = 1;
let SubtargetPredicate = isVI;
}
// This is a pseudo variant of the v_movreld_b32 instruction in which the
// vector operand appears only twice, once as def and once as use. Using this
// pseudo avoids problems with the Two Address instructions pass.
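// As a rough sketch of the intent: V_MOVRELD_B32_V4 below ties a VReg_128
// source and destination ("$vsrc = $vdst") and, when expanded, overwrites a
// single 32-bit lane of that register (selected dynamically through M0, with
// $offset as a static adjustment), leaving the other lanes unchanged.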
class V_MOVRELD_B32_pseudo<RegisterClass rc> : VPseudoInstSI <
(outs rc:$vdst),
(ins rc:$vsrc, VSrc_b32:$val, i32imm:$offset)> {
let VOP1 = 1;
let Constraints = "$vsrc = $vdst";
let Uses = [M0, EXEC];
let SubtargetPredicate = HasMovrel;
}
def V_MOVRELD_B32_V1 : V_MOVRELD_B32_pseudo<VGPR_32>;
def V_MOVRELD_B32_V2 : V_MOVRELD_B32_pseudo<VReg_64>;
def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
let Predicates = [isVI] in {
def : Pat <
(i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
imm:$bound_ctrl)),
(V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
(as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
>;
def : Pat<
(i32 (anyext i16:$src)),
(COPY $src)
>;
def : Pat<
(i64 (anyext i16:$src)),
(REG_SEQUENCE VReg_64,
(i32 (COPY $src)), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
def : Pat<
(i16 (trunc i32:$src)),
(COPY $src)
>;
-def : Pat<
- (i1 (trunc i16:$src)),
- (COPY $src)
->;
-
-
def : Pat <
(i16 (trunc i64:$src)),
(EXTRACT_SUBREG $src, sub0)
>;
} // End Predicates = [isVI]
Index: projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp (revision 314269)
@@ -1,577 +1,582 @@
//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Correlated Value Propagation pass.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "correlated-value-propagation"
STATISTIC(NumPhis, "Number of phis propagated");
STATISTIC(NumSelects, "Number of selects propagated");
STATISTIC(NumMemAccess, "Number of memory access targets propagated");
STATISTIC(NumCmps, "Number of comparisons propagated");
STATISTIC(NumReturns, "Number of return values propagated");
STATISTIC(NumDeadCases, "Number of switch cases removed");
STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
STATISTIC(NumAShrs, "Number of ashr converted to lshr");
STATISTIC(NumSRems, "Number of srem converted to urem");
+static cl::opt<bool> DontProcessAdds("cvp-dont-process-adds", cl::init(true));
+
namespace {
class CorrelatedValuePropagation : public FunctionPass {
public:
static char ID;
CorrelatedValuePropagation(): FunctionPass(ID) {
initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
}
char CorrelatedValuePropagation::ID = 0;
INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
"Value Propagation", false, false)
// Public interface to the Value Propagation pass
Pass *llvm::createCorrelatedValuePropagationPass() {
return new CorrelatedValuePropagation();
}
static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
if (S->getType()->isVectorTy()) return false;
if (isa<Constant>(S->getOperand(0))) return false;
Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
if (!C) return false;
ConstantInt *CI = dyn_cast<ConstantInt>(C);
if (!CI) return false;
Value *ReplaceWith = S->getOperand(1);
Value *Other = S->getOperand(2);
if (!CI->isOne()) std::swap(ReplaceWith, Other);
if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());
S->replaceAllUsesWith(ReplaceWith);
S->eraseFromParent();
++NumSelects;
return true;
}
static bool processPHI(PHINode *P, LazyValueInfo *LVI) {
bool Changed = false;
BasicBlock *BB = P->getParent();
for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
Value *Incoming = P->getIncomingValue(i);
if (isa<Constant>(Incoming)) continue;
Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
// Check whether the incoming value is a select with a scalar condition for which
// LVI can tell us the value. In that case, replace the incoming value with
// the appropriate value of the select. This often allows us to remove the
// select later.
if (!V) {
SelectInst *SI = dyn_cast<SelectInst>(Incoming);
if (!SI) continue;
Value *Condition = SI->getCondition();
if (!Condition->getType()->isVectorTy()) {
if (Constant *C = LVI->getConstantOnEdge(
Condition, P->getIncomingBlock(i), BB, P)) {
if (C->isOneValue()) {
V = SI->getTrueValue();
} else if (C->isZeroValue()) {
V = SI->getFalseValue();
}
// Once LVI learns to handle vector types, we could also add support
// for vector type constants that are not all zeroes or all ones.
}
}
// Check whether the select has a constant operand but LVI tells us that the
// incoming value can never be that constant. In that case, replace the incoming
// value with the other value of the select. This often allows us to
// remove the select later.
if (!V) {
Constant *C = dyn_cast<Constant>(SI->getFalseValue());
if (!C) continue;
if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
P->getIncomingBlock(i), BB, P) !=
LazyValueInfo::False)
continue;
V = SI->getTrueValue();
}
DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
}
P->setIncomingValue(i, V);
Changed = true;
}
// FIXME: Provide TLI, DT, AT to SimplifyInstruction.
const DataLayout &DL = BB->getModule()->getDataLayout();
if (Value *V = SimplifyInstruction(P, DL)) {
P->replaceAllUsesWith(V);
P->eraseFromParent();
Changed = true;
}
if (Changed)
++NumPhis;
return Changed;
}
static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
Value *Pointer = nullptr;
if (LoadInst *L = dyn_cast<LoadInst>(I))
Pointer = L->getPointerOperand();
else
Pointer = cast<StoreInst>(I)->getPointerOperand();
if (isa<Constant>(Pointer)) return false;
Constant *C = LVI->getConstant(Pointer, I->getParent(), I);
if (!C) return false;
++NumMemAccess;
I->replaceUsesOfWith(Pointer, C);
return true;
}
/// See if LazyValueInfo's ability to exploit edge conditions or range
/// information is sufficient to prove this comparison. Even for local
/// conditions, this can sometimes prove conditions instcombine can't by
/// exploiting range information.
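/// For example, given "%c = icmp slt i32 %x, 100" (names illustrative) in a
/// block reached only on an edge where LVI knows %x lies in [0, 10), the
/// compare is proven always true and replaced with the constant i1 true.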
static bool processCmp(CmpInst *C, LazyValueInfo *LVI) {
Value *Op0 = C->getOperand(0);
Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
if (!Op1) return false;
// As a policy choice, we choose not to waste compile time on anything where
// the comparison is testing local values. While LVI can sometimes reason
// about such cases, it's not its primary purpose. We do make sure to do
// the block local query for uses from terminator instructions, but that's
// handled in the code for each terminator.
auto *I = dyn_cast<Instruction>(Op0);
if (I && I->getParent() == C->getParent())
return false;
LazyValueInfo::Tristate Result =
LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C);
if (Result == LazyValueInfo::Unknown) return false;
++NumCmps;
if (Result == LazyValueInfo::True)
C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext()));
else
C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext()));
C->eraseFromParent();
return true;
}
/// Simplify a switch instruction by removing cases which can never fire. If the
/// uselessness of a case could be determined locally then constant propagation
/// would already have figured it out. Instead, walk the predecessors and
/// statically evaluate cases based on information available on that edge. Cases
/// that cannot fire regardless of the incoming edge can safely be removed. If
/// a case fires on every incoming edge then the entire switch can be removed
/// and replaced with a branch to the case destination.
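/// For example, if every predecessor edge proves the switch condition is
/// nonzero, a "case 0" arm can be deleted; if every edge proves the condition
/// equal to one particular case value, the whole switch folds into an
/// unconditional branch to that case's destination.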
static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
Value *Cond = SI->getCondition();
BasicBlock *BB = SI->getParent();
// If the condition was defined in the same block as the switch then LazyValueInfo
// currently won't say anything useful about it, though in theory it could.
if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB)
return false;
// If the switch is unreachable then trying to improve it is a waste of time.
pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
if (PB == PE) return false;
// Analyse each switch case in turn. This is done in reverse order so that
// removing a case doesn't cause trouble for the iteration.
bool Changed = false;
for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE;
) {
ConstantInt *Case = CI.getCaseValue();
// Check to see if the switch condition is equal to/not equal to the case
// value on every incoming edge, equal/not equal being the same each time.
LazyValueInfo::Tristate State = LazyValueInfo::Unknown;
for (pred_iterator PI = PB; PI != PE; ++PI) {
// Is the switch condition equal to the case value?
LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ,
Cond, Case, *PI,
BB, SI);
// Give up on this case if nothing is known.
if (Value == LazyValueInfo::Unknown) {
State = LazyValueInfo::Unknown;
break;
}
// If this was the first edge to be visited, record that all other edges
// need to give the same result.
if (PI == PB) {
State = Value;
continue;
}
// If this case is known to fire for some edges and known not to fire for
// others then there is nothing we can do - give up.
if (Value != State) {
State = LazyValueInfo::Unknown;
break;
}
}
if (State == LazyValueInfo::False) {
// This case never fires - remove it.
CI.getCaseSuccessor()->removePredecessor(BB);
SI->removeCase(CI); // Does not invalidate the iterator.
// The condition can be modified by removePredecessor's PHI simplification
// logic.
Cond = SI->getCondition();
++NumDeadCases;
Changed = true;
} else if (State == LazyValueInfo::True) {
// This case always fires. Arrange for the switch to be turned into an
// unconditional branch by replacing the switch condition with the case
// value.
SI->setCondition(Case);
NumDeadCases += SI->getNumCases();
Changed = true;
break;
}
}
if (Changed)
// If the switch has been simplified to the point where it can be replaced
// by a branch then do so now.
ConstantFoldTerminator(BB);
return Changed;
}
/// Infer nonnull attributes for the arguments at the specified callsite.
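/// For example, for "call void @f(i8* %p)" (names illustrative) where LVI
/// proves %p is never null at the call, the argument is given the nonnull
/// attribute at this call site.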
static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
SmallVector<unsigned, 4> Indices;
unsigned ArgNo = 0;
for (Value *V : CS.args()) {
PointerType *Type = dyn_cast<PointerType>(V->getType());
// Try to mark pointer typed parameters as non-null. We skip the
// relatively expensive analysis for constants which are obviously either
// null or non-null to start with.
if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
!isa<Constant>(V) &&
LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
ConstantPointerNull::get(Type),
CS.getInstruction()) == LazyValueInfo::False)
Indices.push_back(ArgNo + 1);
ArgNo++;
}
assert(ArgNo == CS.arg_size() && "sanity check");
if (Indices.empty())
return false;
AttributeSet AS = CS.getAttributes();
LLVMContext &Ctx = CS.getInstruction()->getContext();
AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull));
CS.setAttributes(AS);
return true;
}
// Helper used when rewriting srem, sdiv, ashr and add below. As a policy
// choice, we choose not to waste compile time on anything where the operands
// are local defs. While LVI can sometimes reason about such cases, it's not
// its primary purpose.
static bool hasLocalDefs(BinaryOperator *SDI) {
for (Value *O : SDI->operands()) {
auto *I = dyn_cast<Instruction>(O);
if (I && I->getParent() == SDI->getParent())
return true;
}
return false;
}
static bool hasPositiveOperands(BinaryOperator *SDI, LazyValueInfo *LVI) {
Constant *Zero = ConstantInt::get(SDI->getType(), 0);
for (Value *O : SDI->operands()) {
auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SGE, O, Zero, SDI);
if (Result != LazyValueInfo::True)
return false;
}
return true;
}
static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
!hasPositiveOperands(SDI, LVI))
return false;
++NumSRems;
auto *BO = BinaryOperator::CreateURem(SDI->getOperand(0), SDI->getOperand(1),
SDI->getName(), SDI);
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
return true;
}
/// See if LazyValueInfo's ability to exploit edge conditions or range
/// information is sufficient to prove that both operands of this SDiv are
/// positive. If this is the case, replace the SDiv with a UDiv. Even for local
/// conditions, this can sometimes prove conditions instcombine can't by
/// exploiting range information.
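/// For example, "%q = sdiv i32 %a, %b" can become "%q = udiv i32 %a, %b" when
/// LVI proves that both %a and %b are non-negative here (names illustrative).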
static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI) ||
!hasPositiveOperands(SDI, LVI))
return false;
++NumSDivs;
auto *BO = BinaryOperator::CreateUDiv(SDI->getOperand(0), SDI->getOperand(1),
SDI->getName(), SDI);
BO->setIsExact(SDI->isExact());
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
return true;
}
static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
if (SDI->getType()->isVectorTy() || hasLocalDefs(SDI))
return false;
Constant *Zero = ConstantInt::get(SDI->getType(), 0);
if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, SDI->getOperand(0), Zero, SDI) !=
LazyValueInfo::True)
return false;
++NumAShrs;
auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
SDI->getName(), SDI);
BO->setIsExact(SDI->isExact());
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();
return true;
}
static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
typedef OverflowingBinaryOperator OBO;
+
+ if (DontProcessAdds)
+ return false;
if (AddOp->getType()->isVectorTy() || hasLocalDefs(AddOp))
return false;
bool NSW = AddOp->hasNoSignedWrap();
bool NUW = AddOp->hasNoUnsignedWrap();
if (NSW && NUW)
return false;
BasicBlock *BB = AddOp->getParent();
Value *LHS = AddOp->getOperand(0);
Value *RHS = AddOp->getOperand(1);
ConstantRange LRange = LVI->getConstantRange(LHS, BB, AddOp);
// Initialize RRange only if we need it. If we know that the guaranteed no-wrap
// range for the given LHS range is empty, don't spend time calculating the
// range for the RHS.
Optional<ConstantRange> RRange;
auto LazyRRange = [&] () {
if (!RRange)
RRange = LVI->getConstantRange(RHS, BB, AddOp);
return RRange.getValue();
};
bool Changed = false;
if (!NUW) {
ConstantRange NUWRange =
LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
OBO::NoUnsignedWrap);
if (!NUWRange.isEmptySet()) {
bool NewNUW = NUWRange.contains(LazyRRange());
AddOp->setHasNoUnsignedWrap(NewNUW);
Changed |= NewNUW;
}
}
if (!NSW) {
ConstantRange NSWRange =
LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
OBO::NoSignedWrap);
if (!NSWRange.isEmptySet()) {
bool NewNSW = NSWRange.contains(LazyRRange());
AddOp->setHasNoSignedWrap(NewNSW);
Changed |= NewNSW;
}
}
return Changed;
}
static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
if (Constant *C = LVI->getConstant(V, At->getParent(), At))
return C;
// TODO: The following really should be sunk inside LVI's core algorithm, or
// at least the outer shims around such.
auto *C = dyn_cast<CmpInst>(V);
if (!C) return nullptr;
Value *Op0 = C->getOperand(0);
Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
if (!Op1) return nullptr;
LazyValueInfo::Tristate Result =
LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At);
if (Result == LazyValueInfo::Unknown)
return nullptr;
return (Result == LazyValueInfo::True) ?
ConstantInt::getTrue(C->getContext()) :
ConstantInt::getFalse(C->getContext());
}
static bool runImpl(Function &F, LazyValueInfo *LVI) {
bool FnChanged = false;
// Visiting in a pre-order depth-first traversal causes us to simplify early
// blocks before querying later blocks (which require us to analyze early
// blocks). Eagerly simplifying shallow blocks means there is strictly less
// work to do for deep blocks. This also means we don't visit unreachable
// blocks.
for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
bool BBChanged = false;
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
Instruction *II = &*BI++;
switch (II->getOpcode()) {
case Instruction::Select:
BBChanged |= processSelect(cast<SelectInst>(II), LVI);
break;
case Instruction::PHI:
BBChanged |= processPHI(cast<PHINode>(II), LVI);
break;
case Instruction::ICmp:
case Instruction::FCmp:
BBChanged |= processCmp(cast<CmpInst>(II), LVI);
break;
case Instruction::Load:
case Instruction::Store:
BBChanged |= processMemAccess(II, LVI);
break;
case Instruction::Call:
case Instruction::Invoke:
BBChanged |= processCallSite(CallSite(II), LVI);
break;
case Instruction::SRem:
BBChanged |= processSRem(cast<BinaryOperator>(II), LVI);
break;
case Instruction::SDiv:
BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
break;
case Instruction::AShr:
BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
break;
case Instruction::Add:
BBChanged |= processAdd(cast<BinaryOperator>(II), LVI);
break;
}
}
Instruction *Term = BB->getTerminator();
switch (Term->getOpcode()) {
case Instruction::Switch:
BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI);
break;
case Instruction::Ret: {
auto *RI = cast<ReturnInst>(Term);
// Try to determine the return value if we can. This is mainly here to
// simplify the writing of unit tests, but also helps to enable IPO by
// constant folding the return values of callees.
auto *RetVal = RI->getReturnValue();
if (!RetVal) break; // handle "ret void"
if (isa<Constant>(RetVal)) break; // nothing to do
if (auto *C = getConstantAt(RetVal, RI, LVI)) {
++NumReturns;
RI->replaceUsesOfWith(RetVal, C);
BBChanged = true;
}
}
};
FnChanged |= BBChanged;
}
return FnChanged;
}
bool CorrelatedValuePropagation::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
return runImpl(F, LVI);
}
PreservedAnalyses
CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
bool Changed = runImpl(F, LVI);
// FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
// solution?
AM.invalidate<LazyValueAnalysis>(F);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
return PA;
}
Index: projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp (revision 314269)
@@ -1,2280 +1,2280 @@
//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass reassociates commutative expressions in an order that is designed
// to promote better constant propagation, GCSE, LICM, PRE, etc.
//
// For example: 4 + (x + 5) -> x + (4 + 5)
//
// In the implementation of this algorithm, constants are assigned rank = 0,
// function arguments are rank = 1, and other values are assigned ranks
// corresponding to the reverse post order traversal of current function
// (starting at 2), which effectively gives values in deep loops higher rank
// than values not in loops.
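//
// For illustration, in the example above the constants 4 and 5 have rank 0
// while x has a higher rank, so the rewrite pairs the two rank-0 constants
// and lets them fold to 9.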
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/Reassociate.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
using namespace reassociate;
#define DEBUG_TYPE "reassociate"
STATISTIC(NumChanged, "Number of insts reassociated");
STATISTIC(NumAnnihil, "Number of expr tree annihilated");
STATISTIC(NumFactor , "Number of multiplies factored");
#ifndef NDEBUG
/// Print out the expression identified in the Ops list.
///
static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
Module *M = I->getModule();
dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
<< *Ops[0].Op->getType() << '\t';
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
dbgs() << "[ ";
Ops[i].Op->printAsOperand(dbgs(), false, M);
dbgs() << ", #" << Ops[i].Rank << "] ";
}
}
#endif
/// Utility class representing a non-constant Xor-operand. We classify
/// non-constant Xor-Operands into two categories:
/// C1) The operand is in the form "X & C", where C is a constant and C != ~0
/// C2)
/// C2.1) The operand is in the form of "X | C", where C is a non-zero
/// constant.
/// C2.2) Any operand E that does not fall into C1 or C2.1 is viewed as
/// "E | 0".
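/// For example, "A & 0x0F" falls under C1, "B | 0x10" under C2.1, and a plain
/// value E is treated as "E | 0" under C2.2.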
class llvm::reassociate::XorOpnd {
public:
XorOpnd(Value *V);
bool isInvalid() const { return SymbolicPart == nullptr; }
bool isOrExpr() const { return isOr; }
Value *getValue() const { return OrigVal; }
Value *getSymbolicPart() const { return SymbolicPart; }
unsigned getSymbolicRank() const { return SymbolicRank; }
const APInt &getConstPart() const { return ConstPart; }
void Invalidate() { SymbolicPart = OrigVal = nullptr; }
void setSymbolicRank(unsigned R) { SymbolicRank = R; }
private:
Value *OrigVal;
Value *SymbolicPart;
APInt ConstPart;
unsigned SymbolicRank;
bool isOr;
};
XorOpnd::XorOpnd(Value *V) {
assert(!isa<ConstantInt>(V) && "No ConstantInt");
OrigVal = V;
Instruction *I = dyn_cast<Instruction>(V);
SymbolicRank = 0;
if (I && (I->getOpcode() == Instruction::Or ||
I->getOpcode() == Instruction::And)) {
Value *V0 = I->getOperand(0);
Value *V1 = I->getOperand(1);
if (isa<ConstantInt>(V0))
std::swap(V0, V1);
if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) {
ConstPart = C->getValue();
SymbolicPart = V0;
isOr = (I->getOpcode() == Instruction::Or);
return;
}
}
// view the operand as "V | 0"
SymbolicPart = V;
ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth());
isOr = true;
}
/// Return true if V is an instruction of the specified opcode and if it
/// only has one use.
static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
if (V->hasOneUse() && isa<Instruction>(V) &&
cast<Instruction>(V)->getOpcode() == Opcode &&
(!isa<FPMathOperator>(V) ||
cast<Instruction>(V)->hasUnsafeAlgebra()))
return cast<BinaryOperator>(V);
return nullptr;
}
static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
unsigned Opcode2) {
if (V->hasOneUse() && isa<Instruction>(V) &&
(cast<Instruction>(V)->getOpcode() == Opcode1 ||
cast<Instruction>(V)->getOpcode() == Opcode2) &&
(!isa<FPMathOperator>(V) ||
cast<Instruction>(V)->hasUnsafeAlgebra()))
return cast<BinaryOperator>(V);
return nullptr;
}
void ReassociatePass::BuildRankMap(Function &F,
ReversePostOrderTraversal<Function*> &RPOT) {
unsigned i = 2;
// Assign distinct ranks to function arguments.
for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
ValueRankMap[&*I] = ++i;
DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n");
}
// Traverse basic blocks in ReversePostOrder
for (BasicBlock *BB : RPOT) {
unsigned BBRank = RankMap[BB] = ++i << 16;
// Walk the basic block, adding precomputed ranks for any instructions that
// we cannot move. This ensures that the ranks for these instructions are
// all different in the block.
for (Instruction &I : *BB)
if (mayBeMemoryDependent(I))
ValueRankMap[&I] = ++BBRank;
}
}
unsigned ReassociatePass::getRank(Value *V) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I) {
if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument.
return 0; // Otherwise it's a global or constant, rank 0.
}
if (unsigned Rank = ValueRankMap[I])
return Rank; // Rank already known?
// If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that
// we can reassociate expressions for code motion! Since we do not recurse
// for PHI nodes, we cannot have infinite recursion here, because there
// cannot be loops in the value graph that do not go through PHI nodes.
unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
for (unsigned i = 0, e = I->getNumOperands();
i != e && Rank != MaxRank; ++i)
Rank = std::max(Rank, getRank(I->getOperand(i)));
// If this is a not or neg instruction, do not count it for rank. This
// assures us that X and ~X will have the same rank.
if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
!BinaryOperator::isFNeg(I))
++Rank;
DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
return ValueRankMap[I] = Rank;
}
// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
void ReassociatePass::canonicalizeOperands(Instruction *I) {
assert(isa<BinaryOperator>(I) && "Expected binary operator.");
assert(I->isCommutative() && "Expected commutative operator.");
Value *LHS = I->getOperand(0);
Value *RHS = I->getOperand(1);
unsigned LHSRank = getRank(LHS);
unsigned RHSRank = getRank(RHS);
if (isa<Constant>(RHS))
return;
if (isa<Constant>(LHS) || RHSRank < LHSRank)
cast<BinaryOperator>(I)->swapOperands();
}
static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
Instruction *InsertBefore, Value *FlagsOp) {
if (S1->getType()->isIntOrIntVectorTy())
return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
else {
BinaryOperator *Res =
BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
return Res;
}
}
static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
Instruction *InsertBefore, Value *FlagsOp) {
if (S1->getType()->isIntOrIntVectorTy())
return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
else {
BinaryOperator *Res =
BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
return Res;
}
}
static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
Instruction *InsertBefore, Value *FlagsOp) {
if (S1->getType()->isIntOrIntVectorTy())
return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
else {
BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore);
Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
return Res;
}
}
/// Replace 0-X with X*-1.
static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
Type *Ty = Neg->getType();
Constant *NegOne = Ty->isIntOrIntVectorTy() ?
ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0);
BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg);
Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op.
Res->takeName(Neg);
Neg->replaceAllUsesWith(Res);
Res->setDebugLoc(Neg->getDebugLoc());
return Res;
}
/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
/// even x in Bitwidth-bit arithmetic.
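/// For example, CarmichaelShift(8) == 6, and indeed x^64 === 1 mod 256 for
/// every odd x (e.g. 3^64 mod 256 == 1).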
static unsigned CarmichaelShift(unsigned Bitwidth) {
if (Bitwidth < 3)
return Bitwidth - 1;
return Bitwidth - 2;
}
/// Add the extra weight 'RHS' to the existing weight 'LHS',
/// reducing the combined weight using any special properties of the operation.
/// The existing weight LHS represents the computation X op X op ... op X where
/// X occurs LHS times. The combined weight represents X op X op ... op X with
/// X occurring LHS + RHS times. If op is "Xor" for example then the combined
/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
// If we were working with infinite precision arithmetic then the combined
// weight would be LHS + RHS. But we are using finite precision arithmetic,
// and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
// for nilpotent operations and addition, but not for idempotent operations
// and multiplication), so it is important to correctly reduce the combined
// weight back into range if wrapping would be wrong.
// If RHS is zero then the weight didn't change.
if (RHS.isMinValue())
return;
// If LHS is zero then the combined weight is RHS.
if (LHS.isMinValue()) {
LHS = RHS;
return;
}
// From this point on we know that neither LHS nor RHS is zero.
if (Instruction::isIdempotent(Opcode)) {
// Idempotent means X op X === X, so any non-zero weight is equivalent to a
// weight of 1. Keeping weights at zero or one also means that wrapping is
// not a problem.
assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
return; // Return a weight of 1.
}
if (Instruction::isNilpotent(Opcode)) {
// Nilpotent means X op X === 0, so reduce weights modulo 2.
assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
LHS = 0; // 1 + 1 === 0 modulo 2.
return;
}
if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
// TODO: Reduce the weight by exploiting nsw/nuw?
LHS += RHS;
return;
}
assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
"Unknown associative operation!");
unsigned Bitwidth = LHS.getBitWidth();
// If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
// can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
// bit number x, since either x is odd in which case x^CM = 1, or x is even in
// which case both x^W and x^(W - CM) are zero. By subtracting off multiples
// of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
// which by a happy accident means that they can always be represented using
// Bitwidth bits.
// TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
// the Carmichael number).
if (Bitwidth > 3) {
/// CM - The value of Carmichael's lambda function.
APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
// Any weight W >= Threshold can be replaced with W - CM.
APInt Threshold = CM + Bitwidth;
assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
// For Bitwidth 4 or more the following sum does not overflow.
LHS += RHS;
while (LHS.uge(Threshold))
LHS -= CM;
} else {
// To avoid problems with overflow, do everything the same as above but using
// a larger type.
unsigned CM = 1U << CarmichaelShift(Bitwidth);
unsigned Threshold = CM + Bitwidth;
assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
"Weights not reduced!");
unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
while (Total >= Threshold)
Total -= CM;
LHS = Total;
}
}
typedef std::pair<Value*, APInt> RepeatedValue;
/// Given an associative binary expression, return the leaf
/// nodes in Ops along with their weights (how many times the leaf occurs). The
/// original expression is the same as
/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times
/// op
/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times
/// op
/// ...
/// op
/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
///
/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
///
/// This routine may modify the function, in which case it returns 'true'. The
/// changes it makes may well be destructive, changing the value computed by 'I'
/// to something completely different. Thus if the routine returns 'true' then
/// you MUST either replace I with a new expression computed from the Ops array,
/// or use RewriteExprTree to put the values back in.
///
/// A leaf node is either not a binary operation of the same kind as the root
/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
/// opcode), or is the same kind of binary operator but has a use which either
/// does not belong to the expression, or does belong to the expression but is
/// a leaf node. Every leaf node has at least one use that is a non-leaf node
/// of the expression, while for non-leaf nodes (except for the root 'I') every
/// use is a non-leaf node of the expression.
///
/// For example:
///                 expression graph          node names
///
///                     +        |        I
///                    / \       |
///                   +   +      |      A,  B
///                  / \ / \     |
///                 *   +   *    |    C,  D,  E
///                / \ / \ / \   |
///                   +   *      |      F,  G
///
/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in
/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
///
/// The expression is maximal: if some instruction is a binary operator of the
/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
/// then the instruction also belongs to the expression, is not a leaf node of
/// it, and its operands also belong to the expression (but may be leaf nodes).
///
/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
/// order to ensure that every non-root node in the expression has *exactly one*
/// use by a non-leaf node of the expression. This destruction means that the
/// caller MUST either replace 'I' with a new expression or use something like
/// RewriteExprTree to put the values back in if the routine indicates that it
/// made a change by returning 'true'.
///
/// In the above example either the right operand of A or the left operand of B
/// will be replaced by undef. If it is B's operand then this gives:
///
///                     +        |        I
///                    / \       |
///                   +   +      |      A,  B - operand of B replaced with undef
///                  / \   \     |
///                 *   +   *    |    C,  D,  E
///                / \ / \ / \   |
///                   +   *      |      F,  G
///
/// Note that such undef operands can only be reached by passing through 'I'.
/// For example, if you visit operands recursively starting from a leaf node
/// then you will never see such an undef operand unless you get back to 'I',
/// which requires passing through a phi node.
///
/// Note that this routine may also mutate binary operators of the wrong type
/// that have all uses inside the expression (i.e. only used by non-leaf nodes
/// of the expression) if it can turn them into binary operators of the right
/// type and thus make the expression bigger.
static bool LinearizeExprTree(BinaryOperator *I,
SmallVectorImpl<RepeatedValue> &Ops) {
DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
unsigned Opcode = I->getOpcode();
assert(I->isAssociative() && I->isCommutative() &&
"Expected an associative and commutative operation!");
// Visit all operands of the expression, keeping track of their weight (the
// number of paths from the expression root to the operand, or if you like
// the number of times that operand occurs in the linearized expression).
// For example, if I = X + A, where X = A + B, then I, X and B have weight 1
// while A has weight two.
// Worklist of non-leaf nodes (their operands are in the expression too) along
// with their weights, representing a certain number of paths to the operator.
// If an operator occurs in the worklist multiple times then we found multiple
// ways to get to it.
SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight)
Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
bool Changed = false;
// Leaves of the expression are values that either aren't the right kind of
// operation (eg: a constant, or a multiply in an add tree), or are, but have
// some uses that are not inside the expression. For example, in I = X + X,
// X = A + B, the value X has two uses (by I) that are in the expression. If
// X has any other uses, for example in a return instruction, then we consider
// X to be a leaf, and won't analyze it further. When we first visit a value,
// if it has more than one use then at first we conservatively consider it to
// be a leaf. Later, as the expression is explored, we may discover some more
// uses of the value from inside the expression. If all uses turn out to be
// from within the expression (and the value is a binary operator of the right
// kind) then the value is no longer considered to be a leaf, and its operands
// are explored.
// Leaves - Keeps track of the set of putative leaves as well as the number of
// paths to each leaf seen so far.
typedef DenseMap<Value*, APInt> LeafMap;
LeafMap Leaves; // Leaf -> Total weight so far.
SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order.
#ifndef NDEBUG
SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme.
#endif
while (!Worklist.empty()) {
std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val();
I = P.first; // We examine the operands of this binary operator.
for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
Value *Op = I->getOperand(OpIdx);
APInt Weight = P.second; // Number of paths to this operand.
DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
assert(!Op->use_empty() && "No uses, so how did we get to it?!");
// If this is a binary operation of the right kind with only one use then
// add its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
assert(Visited.insert(Op).second && "Not first visit!");
DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
Worklist.push_back(std::make_pair(BO, Weight));
continue;
}
// Appears to be a leaf. Is the operand already in the set of leaves?
LeafMap::iterator It = Leaves.find(Op);
if (It == Leaves.end()) {
// Not in the leaf map. Must be the first time we saw this operand.
assert(Visited.insert(Op).second && "Not first visit!");
if (!Op->hasOneUse()) {
// This value has uses not accounted for by the expression, so it is
// not safe to modify. Mark it as being a leaf.
DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
LeafOrder.push_back(Op);
Leaves[Op] = Weight;
continue;
}
// No uses outside the expression, try morphing it.
} else {
// Already in the leaf map.
assert(It != Leaves.end() && Visited.count(Op) &&
"In leaf map but not visited!");
// Update the number of paths to the leaf.
IncorporateWeight(It->second, Weight, Opcode);
#if 0 // TODO: Re-enable once PR13021 is fixed.
// The leaf already has one use from inside the expression. As we want
// exactly one such use, drop this new use of the leaf.
assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
I->setOperand(OpIdx, UndefValue::get(I->getType()));
Changed = true;
// If the leaf is a binary operation of the right kind and we now see
// that its multiple original uses were in fact all by nodes belonging
// to the expression, then no longer consider it to be a leaf and add
// its operands to the expression.
if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
Worklist.push_back(std::make_pair(BO, It->second));
Leaves.erase(It);
continue;
}
#endif
// If we still have uses that are not accounted for by the expression
// then it is not safe to modify the value.
if (!Op->hasOneUse())
continue;
// No uses outside the expression, try morphing it.
Weight = It->second;
Leaves.erase(It); // Since the value may be morphed below.
}
// At this point we have a value which, first of all, is not a binary
// expression of the right kind, and secondly, is only used inside the
// expression. This means that it can safely be modified. See if we
// can usefully morph it into an expression of the right kind.
assert((!isa<Instruction>(Op) ||
cast<Instruction>(Op)->getOpcode() != Opcode
|| (isa<FPMathOperator>(Op) &&
!cast<Instruction>(Op)->hasUnsafeAlgebra())) &&
"Should have been handled above!");
assert(Op->hasOneUse() && "Has uses outside the expression tree!");
// If this is a multiply expression, turn any internal negations into
// multiplies by -1 so they can be reassociated.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
(Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
BO = LowerNegateToMultiply(BO);
DEBUG(dbgs() << *BO << '\n');
Worklist.push_back(std::make_pair(BO, Weight));
Changed = true;
continue;
}
// Failed to morph into an expression of the right type. This really is
// a leaf.
DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
LeafOrder.push_back(Op);
Leaves[Op] = Weight;
}
}
// The leaves, repeated according to their weights, represent the linearized
// form of the expression.
for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
Value *V = LeafOrder[i];
LeafMap::iterator It = Leaves.find(V);
if (It == Leaves.end())
// Node initially thought to be a leaf wasn't.
continue;
assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
APInt Weight = It->second;
if (Weight.isMinValue())
// Leaf already output or weight reduction eliminated it.
continue;
// Ensure the leaf is only output once.
It->second = 0;
Ops.push_back(std::make_pair(V, Weight));
}
// For nilpotent operations or addition there may be no operands, for example
// because the expression was "X xor X" or consisted of 2^Bitwidth additions:
// in both cases the weight reduces to 0 causing the value to be skipped.
if (Ops.empty()) {
Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
assert(Identity && "Associative operation without identity!");
Ops.emplace_back(Identity, APInt(Bitwidth, 1));
}
return Changed;
}
/// Now that the operands for this expression tree are
/// linearized and optimized, emit them in-order.
void ReassociatePass::RewriteExprTree(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops) {
assert(Ops.size() > 1 && "Single values should be used directly!");
// Since our optimizations should never increase the number of operations, the
// new expression can usually be written reusing the existing binary operators
// from the original expression tree, without creating any new instructions,
// though the rewritten expression may have a completely different topology.
// We take care to not change anything if the new expression will be the same
// as the original. If more than trivial changes (like commuting operands)
// were made then we are obliged to clear out any optional subclass data like
// nsw flags.
/// NodesToRewrite - Nodes from the original expression available for writing
/// the new expression into.
SmallVector<BinaryOperator*, 8> NodesToRewrite;
unsigned Opcode = I->getOpcode();
BinaryOperator *Op = I;
/// NotRewritable - The operands being written will be the leaves of the new
/// expression and must not be used as inner nodes (via NodesToRewrite) by
/// mistake. Inner nodes are always reassociable, and usually leaves are not
/// (if they were they would have been incorporated into the expression and so
/// would not be leaves), so most of the time there is no danger of this. But
/// in rare cases a leaf may become reassociable if an optimization kills uses
/// of it, or it may momentarily become reassociable during rewriting (below)
/// due to it being removed as an operand of one of its uses. Ensure that misuse
/// of leaf nodes as inner nodes cannot occur by remembering all of the future
/// leaves and refusing to reuse any of them as inner nodes.
SmallPtrSet<Value*, 8> NotRewritable;
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
NotRewritable.insert(Ops[i].Op);
// ExpressionChanged - Non-null if the rewritten expression differs from the
// original in some non-trivial way, requiring the clearing of optional flags.
// Flags are cleared from the operator in ExpressionChanged up to I inclusive.
BinaryOperator *ExpressionChanged = nullptr;
for (unsigned i = 0; ; ++i) {
// The last operation (which comes earliest in the IR) is special as both
// operands will come from Ops, rather than just one with the other being
// a subexpression.
if (i+2 == Ops.size()) {
Value *NewLHS = Ops[i].Op;
Value *NewRHS = Ops[i+1].Op;
Value *OldLHS = Op->getOperand(0);
Value *OldRHS = Op->getOperand(1);
if (NewLHS == OldLHS && NewRHS == OldRHS)
// Nothing changed, leave it alone.
break;
if (NewLHS == OldRHS && NewRHS == OldLHS) {
// The order of the operands was reversed. Swap them.
DEBUG(dbgs() << "RA: " << *Op << '\n');
Op->swapOperands();
DEBUG(dbgs() << "TO: " << *Op << '\n');
MadeChange = true;
++NumChanged;
break;
}
// The new operation differs non-trivially from the original. Overwrite
// the old operands with the new ones.
DEBUG(dbgs() << "RA: " << *Op << '\n');
if (NewLHS != OldLHS) {
BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
if (BO && !NotRewritable.count(BO))
NodesToRewrite.push_back(BO);
Op->setOperand(0, NewLHS);
}
if (NewRHS != OldRHS) {
BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
if (BO && !NotRewritable.count(BO))
NodesToRewrite.push_back(BO);
Op->setOperand(1, NewRHS);
}
DEBUG(dbgs() << "TO: " << *Op << '\n');
ExpressionChanged = Op;
MadeChange = true;
++NumChanged;
break;
}
// Not the last operation. The left-hand side will be a sub-expression
// while the right-hand side will be the current element of Ops.
Value *NewRHS = Ops[i].Op;
if (NewRHS != Op->getOperand(1)) {
DEBUG(dbgs() << "RA: " << *Op << '\n');
if (NewRHS == Op->getOperand(0)) {
// The new right-hand side was already present as the left operand. If
// we are lucky then swapping the operands will sort out both of them.
Op->swapOperands();
} else {
// Overwrite with the new right-hand side.
BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
if (BO && !NotRewritable.count(BO))
NodesToRewrite.push_back(BO);
Op->setOperand(1, NewRHS);
ExpressionChanged = Op;
}
DEBUG(dbgs() << "TO: " << *Op << '\n');
MadeChange = true;
++NumChanged;
}
// Now deal with the left-hand side. If this is already an operation node
// from the original expression then just rewrite the rest of the expression
// into it.
BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
if (BO && !NotRewritable.count(BO)) {
Op = BO;
continue;
}
// Otherwise, grab a spare node from the original expression and use that as
// the left-hand side. If there are no nodes left then the optimizers made
// an expression with more nodes than the original! This usually means that
// they did something stupid but it might mean that the problem was just too
// hard (finding the minimal number of multiplications needed to realize a
// multiplication expression is NP-complete). Whatever the reason, smart or
// stupid, create a new node if there are none left.
BinaryOperator *NewOp;
if (NodesToRewrite.empty()) {
Constant *Undef = UndefValue::get(I->getType());
NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
Undef, Undef, "", I);
if (NewOp->getType()->isFPOrFPVectorTy())
NewOp->setFastMathFlags(I->getFastMathFlags());
} else {
NewOp = NodesToRewrite.pop_back_val();
}
DEBUG(dbgs() << "RA: " << *Op << '\n');
Op->setOperand(0, NewOp);
DEBUG(dbgs() << "TO: " << *Op << '\n');
ExpressionChanged = Op;
MadeChange = true;
++NumChanged;
Op = NewOp;
}
// If the expression changed non-trivially then clear out all subclass data
// starting from the operator specified in ExpressionChanged, and compactify
// the operators to just before the expression root to guarantee that the
// expression tree is dominated by all of Ops.
if (ExpressionChanged)
do {
// Preserve FastMathFlags.
if (isa<FPMathOperator>(I)) {
FastMathFlags Flags = I->getFastMathFlags();
ExpressionChanged->clearSubclassOptionalData();
ExpressionChanged->setFastMathFlags(Flags);
} else
ExpressionChanged->clearSubclassOptionalData();
if (ExpressionChanged == I)
break;
ExpressionChanged->moveBefore(I);
ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
} while (1);
// Throw away any left over nodes from the original expression.
for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
RedoInsts.insert(NodesToRewrite[i]);
}
/// Insert instructions before the instruction pointed to by BI,
/// that computes the negative version of the value specified. The negative
/// version of the value is returned, and BI is left pointing at the instruction
/// that should be processed next by the reassociation pass.
/// Also add intermediate instructions to the redo list that are modified while
/// pushing the negates through adds. These will be revisited to see if
/// additional opportunities have been exposed.
static Value *NegateValue(Value *V, Instruction *BI,
SetVector<AssertingVH<Instruction>> &ToRedo) {
if (Constant *C = dyn_cast<Constant>(V)) {
if (C->getType()->isFPOrFPVectorTy()) {
return ConstantExpr::getFNeg(C);
}
return ConstantExpr::getNeg(C);
}
// We are trying to expose opportunity for reassociation. One of the things
// that we want to do to achieve this is to push a negation as deep into an
// expression chain as possible, to expose the add instructions. In practice,
// this means that we turn this:
// X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
// so that an expression like Y = 12+X could later get reassociated with the -12 to eliminate
// the constants. We assume that instcombine will clean up the mess later if
// we introduce tons of unnecessary negation instructions.
//
if (BinaryOperator *I =
isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
// Push the negates through the add.
I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
if (I->getOpcode() == Instruction::Add) {
I->setHasNoUnsignedWrap(false);
I->setHasNoSignedWrap(false);
}
// We must move the add instruction here, because the neg instructions do
// not dominate the old add instruction in general. By moving it, we are
// assured that the neg instructions we just inserted dominate the
// instruction we are about to insert after them.
//
I->moveBefore(BI);
I->setName(I->getName()+".neg");
// Add the intermediate negates to the redo list as processing them later
// could expose more reassociating opportunities.
ToRedo.insert(I);
return I;
}
// Okay, we need to materialize a negated version of V with an instruction.
// Scan the use lists of V to see if we have one already.
for (User *U : V->users()) {
if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U))
continue;
// We found one! Now we have to make sure that the definition dominates
// this use. We do this by moving it to the entry block (if it is a
// non-instruction value) or right after the definition. These negates will
// be zapped by reassociate later, so we don't need much finesse here.
BinaryOperator *TheNeg = cast<BinaryOperator>(U);
// Verify that the negate is in this function, V might be a constant expr.
if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
continue;
BasicBlock::iterator InsertPt;
if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
InsertPt = II->getNormalDest()->begin();
} else {
InsertPt = ++InstInput->getIterator();
}
while (isa<PHINode>(InsertPt)) ++InsertPt;
} else {
InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
}
TheNeg->moveBefore(&*InsertPt);
if (TheNeg->getOpcode() == Instruction::Sub) {
TheNeg->setHasNoUnsignedWrap(false);
TheNeg->setHasNoSignedWrap(false);
} else {
TheNeg->andIRFlags(BI);
}
ToRedo.insert(TheNeg);
return TheNeg;
}
// Insert a 'neg' instruction that subtracts the value from zero to get the
// negation.
BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
ToRedo.insert(NewNeg);
return NewNeg;
}
/// Return true if we should break up this subtract of X-Y into (X + -Y).
static bool ShouldBreakUpSubtract(Instruction *Sub) {
// If this is a negation, we can't split it up!
if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub))
return false;
// Don't break up X - undef.
if (isa<UndefValue>(Sub->getOperand(1)))
return false;
// Don't bother to break this up unless either operand is a reassociable add or
// subtract, or this subtract's only use is such an add or subtract.
Value *V0 = Sub->getOperand(0);
if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
return true;
Value *V1 = Sub->getOperand(1);
if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
return true;
Value *VB = Sub->user_back();
if (Sub->hasOneUse() &&
(isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
isReassociableOp(VB, Instruction::Sub, Instruction::FSub)))
return true;
return false;
}
/// If we have (X-Y), and if either X is an add, or if this is only used by an
/// add, transform this into (X+(0-Y)) to promote better reassociation.
static BinaryOperator *
BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
// Convert a subtract into an add and a neg instruction. This allows sub
// instructions to be commuted with other add instructions.
//
// Calculate the negative value of Operand 1 of the sub instruction,
// and set it as the RHS of the add instruction we just made.
//
Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);
BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
New->takeName(Sub);
// Everyone now refers to the add instruction.
Sub->replaceAllUsesWith(New);
New->setDebugLoc(Sub->getDebugLoc());
DEBUG(dbgs() << "Negated: " << *New << '\n');
return New;
}
/// If this is a shift of a reassociable multiply or is used by one, change
/// this into a multiply by a constant to assist with further reassociation.
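/// For example, a reassociable (X << 4) is rewritten as (X * 16), since the
/// multiplier below is computed as (1 << ShAmt).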
static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
BinaryOperator *Mul =
BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
Mul->takeName(Shl);
// Everyone now refers to the mul instruction.
Shl->replaceAllUsesWith(Mul);
Mul->setDebugLoc(Shl->getDebugLoc());
// We can safely preserve the nuw flag in all cases. It's also safe to turn a
// nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special
// handling.
bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
if (NSW && NUW)
Mul->setHasNoSignedWrap(true);
Mul->setHasNoUnsignedWrap(NUW);
return Mul;
}
/// Scan backwards and forwards among values with the same rank as element i
/// to see if X exists. If X does not exist, return i. This is useful when
/// scanning for 'x' when we see '-x' because they both get the same rank.
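/// E.g. when OptimizeAdd sees the operand (0 - x) it looks up 'x' here;
/// because x and its negation get the same rank, only the neighbouring
/// entries with that rank need to be scanned.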
static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
Value *X) {
unsigned XRank = Ops[i].Rank;
unsigned e = Ops.size();
for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
if (Ops[j].Op == X)
return j;
if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
if (Instruction *I2 = dyn_cast<Instruction>(X))
if (I1->isIdenticalTo(I2))
return j;
}
// Scan backwards.
for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) {
if (Ops[j].Op == X)
return j;
if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
if (Instruction *I2 = dyn_cast<Instruction>(X))
if (I1->isIdenticalTo(I2))
return j;
}
return i;
}
/// Emit a tree of add instructions, summing Ops together
/// and returning the result. Insert the tree before I.
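/// E.g. for Ops = [a, b, c] this emits the left-leaning tree (a + b) + c.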
static Value *EmitAddTreeOfValues(Instruction *I,
SmallVectorImpl<WeakVH> &Ops){
if (Ops.size() == 1) return Ops.back();
Value *V1 = Ops.back();
Ops.pop_back();
Value *V2 = EmitAddTreeOfValues(I, Ops);
return CreateAdd(V2, V1, "tmp", I, I);
}
/// If V is an expression tree that is a multiplication sequence,
/// and if this sequence contains a multiply by Factor,
/// remove Factor from the tree and return the new tree.
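/// Illustrative examples: for V = a*b*c and Factor = b the result is a*c; for
/// V = x * -2 and Factor = 2 the constant is removed and the remaining operand
/// is negated, so the result is (0 - x).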
Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
if (!BO)
return nullptr;
SmallVector<RepeatedValue, 8> Tree;
MadeChange |= LinearizeExprTree(BO, Tree);
SmallVector<ValueEntry, 8> Factors;
Factors.reserve(Tree.size());
for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
RepeatedValue E = Tree[i];
Factors.append(E.second.getZExtValue(),
ValueEntry(getRank(E.first), E.first));
}
bool FoundFactor = false;
bool NeedsNegate = false;
for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
if (Factors[i].Op == Factor) {
FoundFactor = true;
Factors.erase(Factors.begin()+i);
break;
}
// If this is a negative version of this factor, remove it.
if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
if (FC1->getValue() == -FC2->getValue()) {
FoundFactor = NeedsNegate = true;
Factors.erase(Factors.begin()+i);
break;
}
} else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
const APFloat &F1 = FC1->getValueAPF();
APFloat F2(FC2->getValueAPF());
F2.changeSign();
if (F1.compare(F2) == APFloat::cmpEqual) {
FoundFactor = NeedsNegate = true;
Factors.erase(Factors.begin() + i);
break;
}
}
}
}
if (!FoundFactor) {
// Make sure to restore the operands to the expression tree.
RewriteExprTree(BO, Factors);
return nullptr;
}
BasicBlock::iterator InsertPt = ++BO->getIterator();
// If this was just a single multiply, remove the multiply and return the only
// remaining operand.
if (Factors.size() == 1) {
RedoInsts.insert(BO);
V = Factors[0].Op;
} else {
RewriteExprTree(BO, Factors);
V = BO;
}
if (NeedsNegate)
V = CreateNeg(V, "neg", &*InsertPt, BO);
return V;
}
/// If V is a single-use multiply, recursively add its operands as factors,
/// otherwise add V to the list of factors.
///
/// Ops is the top-level list of add operands we're trying to factor.
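/// E.g. for V = (a*b)*c, assuming each intermediate multiply is a reassociable
/// single-use mul, the collected factors are [c, b, a].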
static void FindSingleUseMultiplyFactors(Value *V,
SmallVectorImpl<Value*> &Factors,
const SmallVectorImpl<ValueEntry> &Ops) {
BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
if (!BO) {
Factors.push_back(V);
return;
}
// Otherwise, add the LHS and RHS to the list of factors.
FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops);
FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops);
}
/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
/// This optimizes based on identities. If it can be reduced to a single Value,
/// it is returned, otherwise the Ops list is mutated as necessary.
static Value *OptimizeAndOrXor(unsigned Opcode,
SmallVectorImpl<ValueEntry> &Ops) {
// Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
// If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
// First, check for X and ~X in the operand list.
assert(i < Ops.size());
if (BinaryOperator::isNot(Ops[i].Op)) { // Cannot occur for ^.
Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
unsigned FoundX = FindInOperandList(Ops, i, X);
if (FoundX != i) {
if (Opcode == Instruction::And) // ...&X&~X = 0
return Constant::getNullValue(X->getType());
if (Opcode == Instruction::Or) // ...|X|~X = -1
return Constant::getAllOnesValue(X->getType());
}
}
// Next, check for duplicate pairs of values, which we assume are next to
// each other, due to our sorting criteria.
assert(i < Ops.size());
if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
if (Opcode == Instruction::And || Opcode == Instruction::Or) {
// Drop duplicate values for And and Or.
Ops.erase(Ops.begin()+i);
--i; --e;
++NumAnnihil;
continue;
}
// Drop pairs of values for Xor.
assert(Opcode == Instruction::Xor);
if (e == 2)
return Constant::getNullValue(Ops[0].Op->getType());
// Y ^ X^X -> Y
Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
i -= 1; e -= 2;
++NumAnnihil;
}
}
return nullptr;
}
/// Helper function of CombineXorOpnd(). It creates a bitwise-and
/// instruction with the given two operands, and returns the resulting
/// instruction. There are two special cases: 1) if the constant operand is 0,
/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
/// be returned.
static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
const APInt &ConstOpnd) {
if (ConstOpnd != 0) {
if (!ConstOpnd.isAllOnesValue()) {
LLVMContext &Ctx = Opnd->getType()->getContext();
Instruction *I;
I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd),
"and.ra", InsertBefore);
I->setDebugLoc(InsertBefore->getDebugLoc());
return I;
}
return Opnd;
}
return nullptr;
}
// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
// into "R ^ C", where C would be 0, and R is a symbolic value.
//
// If it was successful, true is returned, and the "R" and "C" is returned
// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
// and both "Res" and "ConstOpnd" remain unchanged.
//
bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
APInt &ConstOpnd, Value *&Res) {
// Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
// = ((x | c1) ^ c1) ^ (c1 ^ c2)
// = (x & ~c1) ^ (c1 ^ c2)
// It is useful only when c1 == c2.
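// E.g. with c1 == c2 == 5 and a single-use (x | 5): (x | 5) ^ 5 simplifies
// to (x & ~5), and the accumulated constant becomes 0.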
if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) {
if (!Opnd1->getValue()->hasOneUse())
return false;
const APInt &C1 = Opnd1->getConstPart();
if (C1 != ConstOpnd)
return false;
Value *X = Opnd1->getSymbolicPart();
Res = createAndInstr(I, X, ~C1);
// ConstOpnd was C2, now C1 ^ C2.
ConstOpnd ^= C1;
if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
RedoInsts.insert(T);
return true;
}
return false;
}
// Helper function of OptimizeXor(). It tries to simplify
// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
// symbolic value.
//
// If it was successful, true is returned, and the "R" and "C" is returned
// via "Res" and "ConstOpnd", respectively (If the entire expression is
// evaluated to a constant, the Res is set to NULL); otherwise, false is
// returned, and both "Res" and "ConstOpnd" remain unchanged.
bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
XorOpnd *Opnd2, APInt &ConstOpnd,
Value *&Res) {
Value *X = Opnd1->getSymbolicPart();
if (X != Opnd2->getSymbolicPart())
return false;
// This many instruction become dead.(At least "Opnd1 ^ Opnd2" will die.)
int DeadInstNum = 1;
if (Opnd1->getValue()->hasOneUse())
DeadInstNum++;
if (Opnd2->getValue()->hasOneUse())
DeadInstNum++;
// Xor-Rule 2:
// (x | c1) ^ (x & c2)
// = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
// = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1
// = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3
//
if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
if (Opnd2->isOrExpr())
std::swap(Opnd1, Opnd2);
const APInt &C1 = Opnd1->getConstPart();
const APInt &C2 = Opnd2->getConstPart();
APInt C3((~C1) ^ C2);
// Do not increase code size!
if (C3 != 0 && !C3.isAllOnesValue()) {
int NewInstNum = ConstOpnd != 0 ? 1 : 2;
if (NewInstNum > DeadInstNum)
return false;
}
Res = createAndInstr(I, X, C3);
ConstOpnd ^= C1;
} else if (Opnd1->isOrExpr()) {
// Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
//
const APInt &C1 = Opnd1->getConstPart();
const APInt &C2 = Opnd2->getConstPart();
APInt C3 = C1 ^ C2;
// Do not increase code size
if (C3 != 0 && !C3.isAllOnesValue()) {
int NewInstNum = ConstOpnd != 0 ? 1 : 2;
if (NewInstNum > DeadInstNum)
return false;
}
Res = createAndInstr(I, X, C3);
ConstOpnd ^= C3;
} else {
// Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
//
const APInt &C1 = Opnd1->getConstPart();
const APInt &C2 = Opnd2->getConstPart();
APInt C3 = C1 ^ C2;
Res = createAndInstr(I, X, C3);
}
// Put the original operands in the Redo list; hope they will be deleted
// as dead code.
if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
RedoInsts.insert(T);
if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
RedoInsts.insert(T);
return true;
}
/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
/// to a single Value, it is returned, otherwise the Ops list is mutated as
/// necessary.
Value *ReassociatePass::OptimizeXor(Instruction *I,
SmallVectorImpl<ValueEntry> &Ops) {
if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
return V;
if (Ops.size() == 1)
return nullptr;
SmallVector<XorOpnd, 8> Opnds;
SmallVector<XorOpnd*, 8> OpndPtrs;
Type *Ty = Ops[0].Op->getType();
APInt ConstOpnd(Ty->getIntegerBitWidth(), 0);
// Step 1: Convert ValueEntry to XorOpnd
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
Value *V = Ops[i].Op;
if (!isa<ConstantInt>(V)) {
XorOpnd O(V);
O.setSymbolicRank(getRank(O.getSymbolicPart()));
Opnds.push_back(O);
} else
ConstOpnd ^= cast<ConstantInt>(V)->getValue();
}
// NOTE: From this point on, do *NOT* add/delete element to/from "Opnds".
// It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate
// the "OpndPtrs" as well. For the similar reason, do not fuse this loop
// with the previous loop --- the iterator of the "Opnds" may be invalidated
// when new elements are added to the vector.
for (unsigned i = 0, e = Opnds.size(); i != e; ++i)
OpndPtrs.push_back(&Opnds[i]);
// Step 2: Sort the Xor-Operands in a way such that the operands containing
// the same symbolic value cluster together. For instance, the input operand
// sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
// ("x | 123", "x & 789", "y & 456").
//
// The purpose is twofold:
// 1) Cluster together the operands sharing the same symbolic-value.
// 2) Operands with smaller symbolic-value ranks are permuted earlier, which
// could potentially shorten the critical path and expose more loop
// invariants. Note that values' ranks are basically defined in RPO order
// (FIXME). So, if Rank(X) < Rank(Y) < Rank(Z), X is defined earlier than Y,
// which is defined earlier than Z. Permuting "x | 1", "y & 2", "z" in the
// order X-Y-Z is better than any other order.
std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(),
[](XorOpnd *LHS, XorOpnd *RHS) {
return LHS->getSymbolicRank() < RHS->getSymbolicRank();
});
// Step 3: Combine adjacent operands
XorOpnd *PrevOpnd = nullptr;
bool Changed = false;
for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
XorOpnd *CurrOpnd = OpndPtrs[i];
// The combined value
Value *CV;
// Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
Changed = true;
if (CV)
*CurrOpnd = XorOpnd(CV);
else {
CurrOpnd->Invalidate();
continue;
}
}
if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
PrevOpnd = CurrOpnd;
continue;
}
// Step 3.2: When previous and current operands share the same symbolic
// value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd"
//
if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
// Remove previous operand
PrevOpnd->Invalidate();
if (CV) {
*CurrOpnd = XorOpnd(CV);
PrevOpnd = CurrOpnd;
} else {
CurrOpnd->Invalidate();
PrevOpnd = nullptr;
}
Changed = true;
}
}
// Step 4: Reassemble the Ops
if (Changed) {
Ops.clear();
for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
XorOpnd &O = Opnds[i];
if (O.isInvalid())
continue;
ValueEntry VE(getRank(O.getValue()), O.getValue());
Ops.push_back(VE);
}
if (ConstOpnd != 0) {
Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd);
ValueEntry VE(getRank(C), C);
Ops.push_back(VE);
}
int Sz = Ops.size();
if (Sz == 1)
return Ops.back().Op;
else if (Sz == 0) {
assert(ConstOpnd == 0);
return ConstantInt::get(Ty->getContext(), ConstOpnd);
}
}
return nullptr;
}
/// Optimize a series of operands to an 'add' instruction. This
/// optimizes based on identities. If it can be reduced to a single Value, it
/// is returned, otherwise the Ops list is mutated as necessary.
Value *ReassociatePass::OptimizeAdd(Instruction *I,
SmallVectorImpl<ValueEntry> &Ops) {
// Scan the operand list looking for X and -X pairs. If we find any, we
// can simplify expressions like X + -X == 0 and X + ~X == -1. While we're at
// it, scan for duplicates: we want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
Value *TheOp = Ops[i].Op;
// Check to see if we've seen this operand before. If so, we factor all
// instances of the operand together. Due to our sorting criteria, we know
// that these need to be next to each other in the vector.
if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
// Rescan the list, remove all instances of this operand from the expr.
unsigned NumFound = 0;
do {
Ops.erase(Ops.begin()+i);
++NumFound;
} while (i != Ops.size() && Ops[i].Op == TheOp);
DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
++NumFactor;
// Insert a new multiply.
Type *Ty = TheOp->getType();
Constant *C = Ty->isIntOrIntVectorTy() ?
ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
// Now that we have inserted a multiply, optimize it. This allows us to
// handle cases that require multiple factoring steps, such as this:
// (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
RedoInsts.insert(Mul);
// If every add operand was a duplicate, return the multiply.
if (Ops.empty())
return Mul;
// Otherwise, we had some input that didn't have the dupe, such as
// "A + A + B" -> "A*2 + B". Add the new multiply to the list of
// things being added by this operation.
Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
--i;
e = Ops.size();
continue;
}
// Check for X and -X or X and ~X in the operand list.
if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) &&
!BinaryOperator::isNot(TheOp))
continue;
Value *X = nullptr;
if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))
X = BinaryOperator::getNegArgument(TheOp);
else if (BinaryOperator::isNot(TheOp))
X = BinaryOperator::getNotArgument(TheOp);
unsigned FoundX = FindInOperandList(Ops, i, X);
if (FoundX == i)
continue;
// Remove X and -X from the operand list.
if (Ops.size() == 2 &&
(BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)))
return Constant::getNullValue(X->getType());
// Remove X and ~X from the operand list.
if (Ops.size() == 2 && BinaryOperator::isNot(TheOp))
return Constant::getAllOnesValue(X->getType());
Ops.erase(Ops.begin()+i);
if (i < FoundX)
--FoundX;
else
--i; // Need to back up an extra one.
Ops.erase(Ops.begin()+FoundX);
++NumAnnihil;
--i; // Revisit element.
e -= 2; // Removed two elements.
// If the pair was X and ~X, append -1 to the operand list (X + ~X == -1).
if (BinaryOperator::isNot(TheOp)) {
Value *V = Constant::getAllOnesValue(X->getType());
Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
e += 1;
}
}
// Scan the operand list, checking to see if there are any common factors
// between operands. Consider something like A*A+A*B*C+D. We would like to
// reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
// To efficiently find this, we count the number of times a factor occurs
// for any ADD operands that are MULs.
DenseMap<Value*, unsigned> FactorOccurrences;
// Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
// where they are actually the same multiply.
unsigned MaxOcc = 0;
Value *MaxOccVal = nullptr;
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
BinaryOperator *BOp =
isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
if (!BOp)
continue;
// Compute all of the factors of this added value.
SmallVector<Value*, 8> Factors;
FindSingleUseMultiplyFactors(BOp, Factors, Ops);
assert(Factors.size() > 1 && "Bad linearize!");
// Add one to FactorOccurrences for each unique factor in this op.
SmallPtrSet<Value*, 8> Duplicates;
for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
Value *Factor = Factors[i];
if (!Duplicates.insert(Factor).second)
continue;
unsigned Occ = ++FactorOccurrences[Factor];
if (Occ > MaxOcc) {
MaxOcc = Occ;
MaxOccVal = Factor;
}
// If Factor is a negative constant, add the negated value as a factor
// because we can percolate the negate out. Watch for the minimum signed
// value, which cannot be negated without overflow.
if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
if (CI->isNegative() && !CI->isMinValue(true)) {
Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
- assert(!Duplicates.count(Factor) &&
- "Shouldn't have two constant factors, missed a canonicalize");
+ if (!Duplicates.insert(Factor).second)
+ continue;
unsigned Occ = ++FactorOccurrences[Factor];
if (Occ > MaxOcc) {
MaxOcc = Occ;
MaxOccVal = Factor;
}
}
} else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
if (CF->isNegative()) {
APFloat F(CF->getValueAPF());
F.changeSign();
Factor = ConstantFP::get(CF->getContext(), F);
- assert(!Duplicates.count(Factor) &&
- "Shouldn't have two constant factors, missed a canonicalize");
+ if (!Duplicates.insert(Factor).second)
+ continue;
unsigned Occ = ++FactorOccurrences[Factor];
if (Occ > MaxOcc) {
MaxOcc = Occ;
MaxOccVal = Factor;
}
}
}
}
}
// If any factor occurred more than one time, we can pull it out.
if (MaxOcc > 1) {
DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
++NumFactor;
// Create a new instruction that uses the MaxOccVal twice. Otherwise,
// removing a factor from an expression could drop a use of MaxOccVal,
// which can cause RemoveFactorFromExpression on successive values to
// behave differently.
Instruction *DummyInst =
I->getType()->isIntOrIntVectorTy()
? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
: BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
SmallVector<WeakVH, 4> NewMulOps;
for (unsigned i = 0; i != Ops.size(); ++i) {
// Only try to remove factors from expressions we're allowed to.
BinaryOperator *BOp =
isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
if (!BOp)
continue;
if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
// The factorized operand may occur several times. Convert them all in
// one fell swoop.
for (unsigned j = Ops.size(); j != i;) {
--j;
if (Ops[j].Op == Ops[i].Op) {
NewMulOps.push_back(V);
Ops.erase(Ops.begin()+j);
}
}
--i;
}
}
// No need for extra uses anymore.
delete DummyInst;
unsigned NumAddedValues = NewMulOps.size();
Value *V = EmitAddTreeOfValues(I, NewMulOps);
// Now that we have inserted the add tree, optimize it. This allows us to
// handle cases that require multiple factoring steps, such as this:
// A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
(void)NumAddedValues;
if (Instruction *VI = dyn_cast<Instruction>(V))
RedoInsts.insert(VI);
// Create the multiply.
Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I);
// Rerun associate on the multiply in case the inner expression turned into
// a multiply. We want to make sure that we keep things in canonical form.
RedoInsts.insert(V2);
// If every add operand included the factor (e.g. "A*B + A*C"), then the
// entire result expression is just the multiply "A*(B+C)".
if (Ops.empty())
return V2;
// Otherwise, we had some input that didn't have the factor, such as
// "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
// things being added by this operation.
Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
}
return nullptr;
}
/// \brief Build up a vector of value/power pairs factoring a product.
///
/// Given a series of multiplication operands, build a vector of factors and
/// the powers each is raised to when forming the final product. Sort them in
/// the order of descending power.
///
/// (x*x) -> [(x, 2)]
/// ((x*x)*x) -> [(x, 3)]
/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
///
/// \returns Whether any factors have a power greater than one.
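/// E.g. x*x*y*y has a factor-power sum of 4 and is worth rewriting (it can be
/// rebuilt as t = x*y; t*t, saving a multiply), whereas x*x*y only sums to 2
/// and is left untouched.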
bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
SmallVectorImpl<Factor> &Factors) {
// FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
// Compute the sum of powers of simplifiable factors.
unsigned FactorPowerSum = 0;
for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) {
Value *Op = Ops[Idx-1].Op;
// Count the number of occurrences of this value.
unsigned Count = 1;
for (; Idx < Size && Ops[Idx].Op == Op; ++Idx)
++Count;
// Track for simplification all factors which occur 2 or more times.
if (Count > 1)
FactorPowerSum += Count;
}
// We can only simplify factors if the sum of the powers of our simplifiable
// factors is 4 or higher. When that is the case, we will *always* have
// a simplification. This is an important invariant to prevent cyclically
// trying to simplify already minimal forms.
if (FactorPowerSum < 4)
return false;
// Now gather the simplifiable factors, removing them from Ops.
FactorPowerSum = 0;
for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) {
Value *Op = Ops[Idx-1].Op;
// Count the number of occurrences of this value.
unsigned Count = 1;
for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
++Count;
if (Count == 1)
continue;
// Move an even number of occurrences to Factors.
Count &= ~1U;
Idx -= Count;
FactorPowerSum += Count;
Factors.push_back(Factor(Op, Count));
Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
}
// None of the adjustments above should have reduced the sum of factor powers
// below our minimum of '4'.
assert(FactorPowerSum >= 4);
std::stable_sort(Factors.begin(), Factors.end(),
[](const Factor &LHS, const Factor &RHS) {
return LHS.Power > RHS.Power;
});
return true;
}
/// \brief Build a tree of multiplies, computing the product of Ops.
static Value *buildMultiplyTree(IRBuilder<> &Builder,
SmallVectorImpl<Value*> &Ops) {
if (Ops.size() == 1)
return Ops.back();
Value *LHS = Ops.pop_back_val();
do {
if (LHS->getType()->isIntOrIntVectorTy())
LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
else
LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
} while (!Ops.empty());
return LHS;
}
/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
///
/// Given a vector of values raised to various powers, where no two values are
/// equal and the powers are sorted in decreasing order, compute the minimal
/// DAG of multiplies to compute the final product, and return that product
/// value.
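/// Illustrative example: the factor list [(a, 3), (b, 2)], i.e. a*a*a*b*b, is
/// rebuilt as t = a*b; (t*t)*a, using three multiplies instead of four.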
Value *
ReassociatePass::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
SmallVectorImpl<Factor> &Factors) {
assert(Factors[0].Power);
SmallVector<Value *, 4> OuterProduct;
for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
Idx < Size && Factors[Idx].Power > 0; ++Idx) {
if (Factors[Idx].Power != Factors[LastIdx].Power) {
LastIdx = Idx;
continue;
}
// We want to multiply across all the factors with the same power so that
// we can raise them to that power as a single entity. Build a mini tree
// for that.
SmallVector<Value *, 4> InnerProduct;
InnerProduct.push_back(Factors[LastIdx].Base);
do {
InnerProduct.push_back(Factors[Idx].Base);
++Idx;
} while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
// Reset the base value of the first factor to the new expression tree.
// We'll remove all the factors with the same power in a second pass.
Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
if (Instruction *MI = dyn_cast<Instruction>(M))
RedoInsts.insert(MI);
LastIdx = Idx;
}
// Unique factors with equal powers -- we've folded them into the first one's
// base.
Factors.erase(std::unique(Factors.begin(), Factors.end(),
[](const Factor &LHS, const Factor &RHS) {
return LHS.Power == RHS.Power;
}),
Factors.end());
// Iteratively collect the base of each factor with an odd power into the
// outer product, and halve each power in preparation for squaring the
// expression.
for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
if (Factors[Idx].Power & 1)
OuterProduct.push_back(Factors[Idx].Base);
Factors[Idx].Power >>= 1;
}
if (Factors[0].Power) {
Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
OuterProduct.push_back(SquareRoot);
OuterProduct.push_back(SquareRoot);
}
if (OuterProduct.size() == 1)
return OuterProduct.front();
Value *V = buildMultiplyTree(Builder, OuterProduct);
return V;
}
Value *ReassociatePass::OptimizeMul(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops) {
// We can only optimize the multiplies when there is a chain of more than
// three, such that a balanced tree might require fewer total multiplies.
if (Ops.size() < 4)
return nullptr;
// Try to turn linear trees of multiplies without other uses of the
// intermediate stages into minimal multiply DAGs with perfect sub-expression
// re-use.
SmallVector<Factor, 4> Factors;
if (!collectMultiplyFactors(Ops, Factors))
return nullptr; // All distinct factors, so nothing left for us to do.
IRBuilder<> Builder(I);
// The reassociate transformation for FP operations is performed only
// if unsafe algebra is permitted by FastMathFlags. Propagate those flags
// to the newly generated operations.
if (auto FPI = dyn_cast<FPMathOperator>(I))
Builder.setFastMathFlags(FPI->getFastMathFlags());
Value *V = buildMinimalMultiplyDAG(Builder, Factors);
if (Ops.empty())
return V;
ValueEntry NewEntry = ValueEntry(getRank(V), V);
Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry);
return nullptr;
}
Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops) {
// Now that we have the linearized expression tree, try to optimize it.
// Start by folding any constants that we found.
Constant *Cst = nullptr;
unsigned Opcode = I->getOpcode();
while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
Constant *C = cast<Constant>(Ops.pop_back_val().Op);
Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
}
// If there was nothing but constants then we are done.
if (Ops.empty())
return Cst;
// Put the combined constant back at the end of the operand list, except if
// there is no point. For example, an add of 0 gets dropped here, while a
// multiplication by zero turns the whole expression into zero.
if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
return Cst;
Ops.push_back(ValueEntry(0, Cst));
}
if (Ops.size() == 1) return Ops[0].Op;
// Handle destructive annihilation due to identities between elements in the
// argument list here.
unsigned NumOps = Ops.size();
switch (Opcode) {
default: break;
case Instruction::And:
case Instruction::Or:
if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
return Result;
break;
case Instruction::Xor:
if (Value *Result = OptimizeXor(I, Ops))
return Result;
break;
case Instruction::Add:
case Instruction::FAdd:
if (Value *Result = OptimizeAdd(I, Ops))
return Result;
break;
case Instruction::Mul:
case Instruction::FMul:
if (Value *Result = OptimizeMul(I, Ops))
return Result;
break;
}
if (Ops.size() != NumOps)
return OptimizeExpression(I, Ops);
return nullptr;
}
// Remove dead instructions; if any of their operands become trivially dead,
// add them to Insts so they will be removed as well.
void ReassociatePass::RecursivelyEraseDeadInsts(
Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
SmallVector<Value *, 4> Ops(I->op_begin(), I->op_end());
ValueRankMap.erase(I);
Insts.remove(I);
RedoInsts.remove(I);
I->eraseFromParent();
for (auto Op : Ops)
if (Instruction *OpInst = dyn_cast<Instruction>(Op))
if (OpInst->use_empty())
Insts.insert(OpInst);
}
/// Zap the given instruction, adding interesting operands to the work list.
void ReassociatePass::EraseInst(Instruction *I) {
assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
// Erase the dead instruction.
ValueRankMap.erase(I);
RedoInsts.remove(I);
I->eraseFromParent();
// Optimize its operands.
SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) {
// If this is a node in an expression tree, climb to the expression root
// and add that since that's where optimization actually happens.
unsigned Opcode = Op->getOpcode();
while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
Visited.insert(Op).second)
Op = Op->user_back();
RedoInsts.insert(Op);
}
}
// Canonicalize expressions of the following form:
// x + (-Constant * y) -> x - (Constant * y)
// x - (-Constant * y) -> x + (Constant * y)
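// E.g. x + (-2.0 * y), where the fmul has a single use, is rewritten to
// x - (2.0 * y).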
Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
if (!I->hasOneUse() || I->getType()->isVectorTy())
return nullptr;
// Must be a fmul or fdiv instruction.
unsigned Opcode = I->getOpcode();
if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv)
return nullptr;
auto *C0 = dyn_cast<ConstantFP>(I->getOperand(0));
auto *C1 = dyn_cast<ConstantFP>(I->getOperand(1));
// Both operands are constant, let it get constant folded away.
if (C0 && C1)
return nullptr;
ConstantFP *CF = C0 ? C0 : C1;
// Must have one constant operand.
if (!CF)
return nullptr;
// Must be a negative ConstantFP.
if (!CF->isNegative())
return nullptr;
// User must be a binary operator with one or more uses.
Instruction *User = I->user_back();
if (!isa<BinaryOperator>(User) || !User->hasNUsesOrMore(1))
return nullptr;
unsigned UserOpcode = User->getOpcode();
if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub)
return nullptr;
// Subtraction is not commutative. Explicitly, the following transform is
// not valid: (-Constant * y) - x -> x + (Constant * y)
if (!User->isCommutative() && User->getOperand(1) != I)
return nullptr;
// Change the sign of the constant.
APFloat Val = CF->getValueAPF();
Val.changeSign();
I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val));
// Canonicalize I to RHS to simplify the next bit of logic. E.g.,
// ((-Const*y) + x) -> (x + (-Const*y)).
if (User->getOperand(0) == I && User->isCommutative())
cast<BinaryOperator>(User)->swapOperands();
Value *Op0 = User->getOperand(0);
Value *Op1 = User->getOperand(1);
BinaryOperator *NI;
switch (UserOpcode) {
default:
llvm_unreachable("Unexpected Opcode!");
case Instruction::FAdd:
NI = BinaryOperator::CreateFSub(Op0, Op1);
NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
break;
case Instruction::FSub:
NI = BinaryOperator::CreateFAdd(Op0, Op1);
NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
break;
}
NI->insertBefore(User);
NI->setName(User->getName());
User->replaceAllUsesWith(NI);
NI->setDebugLoc(I->getDebugLoc());
RedoInsts.insert(I);
MadeChange = true;
return NI;
}
/// Inspect and optimize the given instruction. Note that erasing
/// instructions is not allowed.
void ReassociatePass::OptimizeInst(Instruction *I) {
// Only consider operations that we understand.
if (!isa<BinaryOperator>(I))
return;
if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
// If an operand of this shift is a reassociable multiply, or if the shift
// is used by a reassociable multiply or add, turn into a multiply.
if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
(I->hasOneUse() &&
(isReassociableOp(I->user_back(), Instruction::Mul) ||
isReassociableOp(I->user_back(), Instruction::Add)))) {
Instruction *NI = ConvertShiftToMul(I);
RedoInsts.insert(I);
MadeChange = true;
I = NI;
}
// Canonicalize negative constants out of expressions.
if (Instruction *Res = canonicalizeNegConstExpr(I))
I = Res;
// Commute binary operators, to canonicalize the order of their operands.
// This can potentially expose more CSE opportunities, and makes writing other
// transformations simpler.
if (I->isCommutative())
canonicalizeOperands(I);
// TODO: We should optimize vector Xor instructions, but they are
// currently unsupported.
if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor)
return;
// Don't optimize floating point instructions that don't have unsafe algebra.
if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())
return;
// Do not reassociate boolean (i1) expressions. We want to preserve the
// original order of evaluation for short-circuited comparisons that
// SimplifyCFG has folded to AND/OR expressions. If the expression
// is not further optimized, it is likely to be transformed back to a
// short-circuited form for code gen, and the source order may have been
// optimized for the most likely conditions.
if (I->getType()->isIntegerTy(1))
return;
// If this is a subtract instruction which is not already in negate form,
// see if we can convert it to X+-Y.
if (I->getOpcode() == Instruction::Sub) {
if (ShouldBreakUpSubtract(I)) {
Instruction *NI = BreakUpSubtract(I, RedoInsts);
RedoInsts.insert(I);
MadeChange = true;
I = NI;
} else if (BinaryOperator::isNeg(I)) {
// Otherwise, this is a negation. See if the operand is a multiply tree
// and if this is not an inner node of a multiply tree.
if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
(!I->hasOneUse() ||
!isReassociableOp(I->user_back(), Instruction::Mul))) {
Instruction *NI = LowerNegateToMultiply(I);
// If the negate was simplified, revisit the users to see if we can
// reassociate further.
for (User *U : NI->users()) {
if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
RedoInsts.insert(Tmp);
}
RedoInsts.insert(I);
MadeChange = true;
I = NI;
}
}
} else if (I->getOpcode() == Instruction::FSub) {
if (ShouldBreakUpSubtract(I)) {
Instruction *NI = BreakUpSubtract(I, RedoInsts);
RedoInsts.insert(I);
MadeChange = true;
I = NI;
} else if (BinaryOperator::isFNeg(I)) {
// Otherwise, this is a negation. See if the operand is a multiply tree
// and if this is not an inner node of a multiply tree.
if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
(!I->hasOneUse() ||
!isReassociableOp(I->user_back(), Instruction::FMul))) {
// If the negate was simplified, revisit the users to see if we can
// reassociate further.
Instruction *NI = LowerNegateToMultiply(I);
for (User *U : NI->users()) {
if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
RedoInsts.insert(Tmp);
}
RedoInsts.insert(I);
MadeChange = true;
I = NI;
}
}
}
// If this instruction is an associative binary operator, process it.
if (!I->isAssociative()) return;
BinaryOperator *BO = cast<BinaryOperator>(I);
// If this is an interior node of a reassociable tree, ignore it until we
// get to the root of the tree, to avoid N^2 analysis.
unsigned Opcode = BO->getOpcode();
if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
// During the initial run we will get to the root of the tree.
// But if we get here while we are redoing instructions, there is no
// guarantee that the root will be visited, so queue it for a redo later.
if (BO->user_back() != BO &&
BO->getParent() == BO->user_back()->getParent())
RedoInsts.insert(BO->user_back());
return;
}
// If this is an add tree that is used by a sub instruction, ignore it
// until we process the subtract.
if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
return;
if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
return;
ReassociateExpression(BO);
}
void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
// First, walk the expression tree, linearizing the tree, collecting the
// operand information.
SmallVector<RepeatedValue, 8> Tree;
MadeChange |= LinearizeExprTree(I, Tree);
SmallVector<ValueEntry, 8> Ops;
Ops.reserve(Tree.size());
for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
RepeatedValue E = Tree[i];
Ops.append(E.second.getZExtValue(),
ValueEntry(getRank(E.first), E.first));
}
DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
// Now that we have linearized the tree to a list and have gathered all of
// the operands and their ranks, sort the operands by their rank. Use a
// stable_sort so that values with equal ranks will have their relative
// positions maintained (and so the compiler is deterministic). Note that
// this sorts so that the highest ranking values end up at the beginning of
// the vector.
std::stable_sort(Ops.begin(), Ops.end());
// Now that we have the expression tree in a convenient
// sorted form, optimize it globally if possible.
if (Value *V = OptimizeExpression(I, Ops)) {
if (V == I)
// Self-referential expression in unreachable code.
return;
// This expression tree simplified to something that isn't a tree,
// eliminate it.
DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
I->replaceAllUsesWith(V);
if (Instruction *VI = dyn_cast<Instruction>(V))
VI->setDebugLoc(I->getDebugLoc());
RedoInsts.insert(I);
++NumAnnihil;
return;
}
// We want to sink immediates as deeply as possible except in the case where
// this is a multiply tree used only by an add, and the immediate is a -1.
// In this case we reassociate to put the negation on the outside so that we
// can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
if (I->hasOneUse()) {
if (I->getOpcode() == Instruction::Mul &&
cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
isa<ConstantInt>(Ops.back().Op) &&
cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
ValueEntry Tmp = Ops.pop_back_val();
Ops.insert(Ops.begin(), Tmp);
} else if (I->getOpcode() == Instruction::FMul &&
cast<Instruction>(I->user_back())->getOpcode() ==
Instruction::FAdd &&
isa<ConstantFP>(Ops.back().Op) &&
cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) {
ValueEntry Tmp = Ops.pop_back_val();
Ops.insert(Ops.begin(), Tmp);
}
}
DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
if (Ops.size() == 1) {
if (Ops[0].Op == I)
// Self-referential expression in unreachable code.
return;
// This expression tree simplified to something that isn't a tree,
// eliminate it.
I->replaceAllUsesWith(Ops[0].Op);
if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
OI->setDebugLoc(I->getDebugLoc());
RedoInsts.insert(I);
return;
}
// Now that we ordered and optimized the expressions, splat them back into
// the expression tree, removing any unneeded nodes.
RewriteExprTree(I, Ops);
}
PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
// Get the function's basic blocks in Reverse Post Order. This order is used
// by BuildRankMap to precalculate ranks correctly. It also excludes dead
// basic blocks (the analysis in this pass has been seen to hang when
// analyzing dead basic blocks).
ReversePostOrderTraversal<Function *> RPOT(&F);
// Calculate the rank map for F.
BuildRankMap(F, RPOT);
MadeChange = false;
// Traverse the same blocks that were analyzed by BuildRankMap.
for (BasicBlock *BI : RPOT) {
assert(RankMap.count(&*BI) && "BB should be ranked.");
// Optimize every instruction in the basic block.
for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;)
if (isInstructionTriviallyDead(&*II)) {
EraseInst(&*II++);
} else {
OptimizeInst(&*II);
assert(II->getParent() == &*BI && "Moved to a different block!");
++II;
}
// Make a copy of all the instructions to be redone so we can remove dead
// instructions.
SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
// Iterate over all instructions to be reevaluated and remove trivially dead
// instructions. If any operand of a trivially dead instruction becomes
// dead, mark it for deletion as well. Continue this process until all
// trivially dead instructions have been removed.
while (!ToRedo.empty()) {
Instruction *I = ToRedo.pop_back_val();
if (isInstructionTriviallyDead(I)) {
RecursivelyEraseDeadInsts(I, ToRedo);
MadeChange = true;
}
}
// Now that we have removed dead instructions, we can reoptimize the
// remaining instructions.
while (!RedoInsts.empty()) {
Instruction *I = RedoInsts.pop_back_val();
if (isInstructionTriviallyDead(I))
EraseInst(I);
else
OptimizeInst(I);
}
}
// We are done with the rank map.
RankMap.clear();
ValueRankMap.clear();
if (MadeChange) {
// FIXME: This should also 'preserve the CFG'.
auto PA = PreservedAnalyses();
PA.preserve<GlobalsAA>();
return PA;
}
return PreservedAnalyses::all();
}
namespace {
class ReassociateLegacyPass : public FunctionPass {
ReassociatePass Impl;
public:
static char ID; // Pass identification, replacement for typeid
ReassociateLegacyPass() : FunctionPass(ID) {
initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
FunctionAnalysisManager DummyFAM;
auto PA = Impl.run(F, DummyFAM);
return !PA.areAllPreserved();
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
}
char ReassociateLegacyPass::ID = 0;
INITIALIZE_PASS(ReassociateLegacyPass, "reassociate",
"Reassociate expressions", false, false)
// Public interface to the Reassociate pass
FunctionPass *llvm::createReassociatePass() {
return new ReassociateLegacyPass();
}
Index: projects/clang400-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 314269)
@@ -1,4945 +1,4854 @@
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>
using namespace llvm;
using namespace slpvectorizer;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize if you gain more than this "
"number "));
static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
cl::desc("Attempt to vectorize horizontal reductions"));
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
"slp-vectorize-hor-store", cl::init(false), cl::Hidden,
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
cl::desc("Limit the size of the SLP scheduling region per block"));
static cl::opt<int> MinVectorRegSizeOption(
"slp-min-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
static cl::opt<unsigned> RecursionMaxDepth(
"slp-recursion-max-depth", cl::init(12), cl::Hidden,
cl::desc("Limit the recursion depth when building a vectorizable tree"));
static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;
/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;
/// \brief Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
!Ty->isPPC_FP128Ty();
}
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
return false;
BasicBlock *BB = I0->getParent();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I)
return false;
if (BB != I->getParent())
return false;
}
return true;
}
/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
for (Value *i : VL)
if (!isa<Constant>(i))
return false;
return true;
}
/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
for (unsigned i = 1, e = VL.size(); i < e; ++i)
if (VL[i] != VL[0])
return false;
return true;
}
///\returns Opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
switch (Op) {
case Instruction::FAdd:
return Instruction::FSub;
case Instruction::FSub:
return Instruction::FAdd;
case Instruction::Add:
return Instruction::Sub;
case Instruction::Sub:
return Instruction::Add;
default:
return 0;
}
}
///\returns bool representing if Opcode \p Op can be part
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
return Op == Instruction::FAdd || Op == Instruction::FSub ||
Op == Instruction::Sub || Op == Instruction::Add;
}
/// \returns ShuffleVector instruction if instructions in \p VL have
/// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
/// (e.g. an opcode sequence of fadd,fsub,fadd,fsub...)
static unsigned isAltInst(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
unsigned Opcode = I0->getOpcode();
unsigned AltOpcode = getAltOpcode(Opcode);
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
return 0;
}
return Instruction::ShuffleVector;
}
/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
return 0;
unsigned Opcode = I0->getOpcode();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I || Opcode != I->getOpcode()) {
if (canCombineAsAltInst(Opcode) && i == 1)
return isAltInst(VL);
return 0;
}
}
return Opcode;
}
/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
if (auto *VecOp = dyn_cast<Instruction>(I)) {
if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
// Intersection is initialized to the 0th scalar,
// so start counting from index '1'.
for (int i = 1, e = VL.size(); i < e; ++i) {
if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
Intersection->andIRFlags(Scalar);
}
VecOp->copyIRFlags(Intersection);
}
}
}
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
Type *Ty = VL[0]->getType();
for (int i = 1, e = VL.size(); i < e; i++)
if (VL[i]->getType() != Ty)
return false;
return true;
}
/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
assert(Opcode == Instruction::ExtractElement ||
Opcode == Instruction::ExtractValue);
if (Opcode == Instruction::ExtractElement) {
ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
return CI && CI->getZExtValue() == Idx;
} else {
ExtractValueInst *EI = cast<ExtractValueInst>(E);
return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
}
}
/// \returns True if an in-tree use also needs an extract. This refers to a
/// possible scalar operand in the vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
TargetLibraryInfo *TLI) {
unsigned Opcode = UserInst->getOpcode();
switch (Opcode) {
case Instruction::Load: {
LoadInst *LI = cast<LoadInst>(UserInst);
return (LI->getPointerOperand() == Scalar);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(UserInst);
return (SI->getPointerOperand() == Scalar);
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(UserInst);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
return (CI->getArgOperand(1) == Scalar);
}
}
default:
return false;
}
}
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return MemoryLocation::get(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return MemoryLocation::get(LI);
return MemoryLocation();
}
/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return LI->isSimple();
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return SI->isSimple();
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
return !MI->isVolatile();
return true;
}
namespace llvm {
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
typedef SmallVector<Value *, 8> ValueList;
typedef SmallVector<Instruction *, 16> InstrList;
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
DL(DL), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
// TODO: It would be better to limit the vectorization factor based on
// data type rather than just register size. For example, x86 AVX has
// 256-bit registers, but it does not support integer operations
// at that width (that requires AVX2).
if (MaxVectorRegSizeOption.getNumOccurrences())
MaxVecRegSize = MaxVectorRegSizeOption;
else
MaxVecRegSize = TTI->getRegisterBitWidth(true);
MinVecRegSize = MinVectorRegSizeOption;
}
/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
int getSpillCost();
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
int getTreeCost();
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
VectorizableTree.clear();
ScalarToTreeEntry.clear();
MustGather.clear();
ExternalUses.clear();
NumLoadsWantToKeepOrder = 0;
NumLoadsWantToChangeOrder = 0;
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
}
MinBWs.clear();
}
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
/// \returns true if it is beneficial to reverse the vector order.
bool shouldReorder() const {
return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
}
/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of
/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.
unsigned getVectorElementSize(Value *V);
/// Compute the minimum type sizes required to represent the entries in a
/// vectorizable tree.
void computeMinimumValueSizes();
// \returns maximum vector register size as set by TTI or overridden by cl::opt.
unsigned getMaxVecRegSize() const {
return MaxVecRegSize;
}
// \returns minimum vector register size as set by cl::opt.
unsigned getMinVecRegSize() const {
return MinVecRegSize;
}
/// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
///
/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
unsigned canMapToVector(Type *T, const DataLayout &DL) const;
/// \returns True if the VectorizableTree is both tiny and not fully
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable();
private:
struct TreeEntry;
/// \returns the cost of the vectorizable entry.
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
/// \returns True if the ExtractElement/ExtractValue instructions in VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
/// \returns the pointer to the vectorized value if \p VL is already
/// vectorized, or NULL. This may happen in cycles.
Value *alreadyVectorized(ArrayRef<Value *> VL) const;
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
int getGatherCost(Type *Ty);
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL);
/// \brief Set the Builder insert point to one after the last instruction in
/// the bundle.
void setInsertPointAfterBundle(ArrayRef<Value *> VL);
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even if the tree height is tiny.
bool isFullyVectorizableTinyTree();
/// \brief Reorder commutative operands in alt shuffle if they result in
/// vectorized code.
void reorderAltShuffleOperands(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
/// \brief Reorder commutative operands to get better probability of
/// generating vectorized code.
void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
struct TreeEntry {
TreeEntry() : Scalars(), VectorizedValue(nullptr),
NeedToGather(0) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
assert(VL.size() == Scalars.size() && "Invalid size");
return std::equal(VL.begin(), VL.end(), Scalars.begin());
}
/// A vector of scalars.
ValueList Scalars;
/// The Scalars are vectorized into this value. It is initialized to Null.
Value *VectorizedValue;
/// Do we need to gather this sequence?
bool NeedToGather;
};
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
VectorizableTree.emplace_back();
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
ScalarToTreeEntry[VL[i]] = idx;
}
} else {
MustGather.insert(VL.begin(), VL.end());
}
return Last;
}
/// -- Vectorization State --
/// Holds all of the tree entries.
std::vector<TreeEntry> VectorizableTree;
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, int> ScalarToTreeEntry;
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser (Value *S, llvm::User *U, int L) :
Scalar(S), User(U), Lane(L){}
// Which scalar in our function.
Value *Scalar;
// Which user that uses the scalar.
llvm::User *User;
// Which lane does the scalar belong to.
int Lane;
};
typedef SmallVector<ExternalUser, 16> UserList;
/// Checks if two instructions may access the same memory.
///
/// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
/// is invariant in the calling loop.
bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
Instruction *Inst2) {
// First check if the result is already in the cache.
AliasCacheKey key = std::make_pair(Inst1, Inst2);
Optional<bool> &result = AliasCache[key];
if (result.hasValue()) {
return result.getValue();
}
MemoryLocation Loc2 = getLocation(Inst2, AA);
bool aliased = true;
if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
// Do the alias check.
aliased = AA->alias(Loc1, Loc2);
}
// Store the result in the cache.
result = aliased;
return aliased;
}
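// Note that the cache is keyed on the ordered (Inst1, Inst2) pair, and that
// the answer conservatively defaults to "aliased" whenever either location is
// unknown or either access is not a simple load/store.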
typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
/// Cache for alias results.
/// TODO: consider moving this to the AliasAnalysis itself.
DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
/// Removes an instruction from its block and eventually deletes it.
/// It's like Instruction::eraseFromParent() except that the actual deletion
/// is delayed until BoUpSLP is destructed.
/// This is required to ensure that there are no incorrect collisions in the
/// AliasCache, which can happen if a new instruction is allocated at the
/// same address as a previously deleted instruction.
void eraseInstruction(Instruction *I) {
I->removeFromParent();
I->dropAllReferences();
DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
}
/// Temporary store for deleted instructions. Instructions will be deleted
/// eventually when the BoUpSLP is destructed.
SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
/// A list of values that need to be extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User).
UserList ExternalUses;
/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
/// A list of blocks that we are going to CSE.
SetVector<BasicBlock *> CSEBlocks;
/// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an
/// instruction bundle (= a group of instructions which is combined into a
/// vector instruction).
struct ScheduleData {
// The initial value for the dependency counters. It means that the
// dependencies are not calculated yet.
enum { InvalidDeps = -1 };
ScheduleData()
: Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
void init(int BlockSchedulingRegionID) {
FirstInBundle = this;
NextInBundle = nullptr;
NextLoadStore = nullptr;
IsScheduled = false;
SchedulingRegionID = BlockSchedulingRegionID;
UnscheduledDepsInBundle = UnscheduledDeps;
clearDependencies();
}
/// Returns true if the dependency information has been calculated.
bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
/// Returns true for single instructions and for bundle representatives
/// (= the head of a bundle).
bool isSchedulingEntity() const { return FirstInBundle == this; }
/// Returns true if it represents an instruction bundle and not only a
/// single instruction.
bool isPartOfBundle() const {
return NextInBundle != nullptr || FirstInBundle != this;
}
/// Returns true if it is ready for scheduling, i.e. it has no more
/// unscheduled dependent instructions/bundles.
bool isReady() const {
assert(isSchedulingEntity() &&
"can't consider non-scheduling entity for ready list");
return UnscheduledDepsInBundle == 0 && !IsScheduled;
}
/// Modifies the number of unscheduled dependencies, also updating it for
/// the whole bundle.
int incrementUnscheduledDeps(int Incr) {
UnscheduledDeps += Incr;
return FirstInBundle->UnscheduledDepsInBundle += Incr;
}
/// Sets the number of unscheduled dependencies to the number of
/// dependencies.
void resetUnscheduledDeps() {
incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
}
/// Clears all dependency information.
void clearDependencies() {
Dependencies = InvalidDeps;
resetUnscheduledDeps();
MemoryDependencies.clear();
}
void dump(raw_ostream &os) const {
if (!isSchedulingEntity()) {
os << "/ " << *Inst;
} else if (NextInBundle) {
os << '[' << *Inst;
ScheduleData *SD = NextInBundle;
while (SD) {
os << ';' << *SD->Inst;
SD = SD->NextInBundle;
}
os << ']';
} else {
os << *Inst;
}
}
Instruction *Inst;
/// Points to the head in an instruction bundle (and always to this for
/// single instructions).
ScheduleData *FirstInBundle;
/// Singly linked list of all instructions in a bundle. Null if it is a
/// single instruction.
ScheduleData *NextInBundle;
/// Singly linked list of all memory instructions (e.g. load, store, call)
/// in the block - until the end of the scheduling region.
ScheduleData *NextLoadStore;
/// The dependent memory instructions.
/// This list is derived on demand in calculateDependencies().
SmallVector<ScheduleData *, 4> MemoryDependencies;
/// This ScheduleData is in the current scheduling region if this matches
/// the current SchedulingRegionID of BlockScheduling.
int SchedulingRegionID;
/// Used for getting a "good" final ordering of instructions.
int SchedulingPriority;
/// The number of dependencies. This is the number of users of the
/// instruction plus the number of dependent memory instructions (if any).
/// This value is calculated on demand.
/// If InvalidDeps, the number of dependencies is not calculated yet.
int Dependencies;
/// The number of dependencies minus the number of already scheduled
/// dependencies. As soon as this is zero, the instruction/bundle gets ready
/// for scheduling.
/// Note that this is negative as long as Dependencies is not calculated.
int UnscheduledDeps;
/// The sum of UnscheduledDeps in a bundle. Equals UnscheduledDeps for
/// single instructions.
int UnscheduledDepsInBundle;
/// True if this instruction is scheduled (or considered as scheduled in the
/// dry-run).
bool IsScheduled;
};
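// For illustration, a bundle {A, B, C} with A as its head is linked
// A -> B -> C via NextInBundle, FirstInBundle of all three members points at
// A, and only A acts as the scheduling entity.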
#ifndef NDEBUG
friend inline raw_ostream &operator<<(raw_ostream &os,
const BoUpSLP::ScheduleData &SD) {
SD.dump(os);
return os;
}
#endif
/// Contains all scheduling data for a basic block.
///
struct BlockScheduling {
BlockScheduling(BasicBlock *BB)
: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
ScheduleStart(nullptr), ScheduleEnd(nullptr),
FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
ScheduleRegionSize(0),
ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
// Make sure that the initial SchedulingRegionID is greater than the
// initial SchedulingRegionID in ScheduleData (which is 0).
SchedulingRegionID(1) {}
void clear() {
ReadyInsts.clear();
ScheduleStart = nullptr;
ScheduleEnd = nullptr;
FirstLoadStoreInRegion = nullptr;
LastLoadStoreInRegion = nullptr;
// Reduce the maximum schedule region size by the size of the
// previous scheduling run.
ScheduleRegionSizeLimit -= ScheduleRegionSize;
if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
ScheduleRegionSizeLimit = MinScheduleRegionSize;
ScheduleRegionSize = 0;
// Make a new scheduling region, i.e. all existing ScheduleData is not
// in the new region yet.
++SchedulingRegionID;
}
ScheduleData *getScheduleData(Value *V) {
ScheduleData *SD = ScheduleDataMap[V];
if (SD && SD->SchedulingRegionID == SchedulingRegionID)
return SD;
return nullptr;
}
bool isInSchedulingRegion(ScheduleData *SD) {
return SD->SchedulingRegionID == SchedulingRegionID;
}
/// Marks an instruction as scheduled and puts all dependent ready
/// instructions into the ready-list.
template <typename ReadyListType>
void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
SD->IsScheduled = true;
DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
ScheduleData *BundleMember = SD;
while (BundleMember) {
// Handle the def-use chain dependencies.
for (Use &U : BundleMember->Inst->operands()) {
ScheduleData *OpDef = getScheduleData(U.get());
if (OpDef && OpDef->hasValidDependencies() &&
OpDef->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
ScheduleData *DepBundle = OpDef->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
}
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
assert(!DepBundle->IsScheduled &&
"already scheduled bundle gets ready");
ReadyList.insert(DepBundle);
DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
BundleMember = BundleMember->NextInBundle;
}
}
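// In other words, scheduling a bundle decrements the unscheduled-dependency
// counters of the ScheduleData attached to each member's operands and to each
// member's memory-dependent instructions; any bundle whose counter drops to
// zero is appended to the ready list.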
/// Put all instructions into the ReadyList which are ready for scheduling.
template <typename ReadyListType>
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
ScheduleData *SD = getScheduleData(I);
if (SD->isSchedulingEntity() && SD->isReady()) {
ReadyList.insert(SD);
DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
}
}
}
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL);
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
bool extendSchedulingRegion(Value *V);
/// Initialize the ScheduleData structures for new instructions in the
/// scheduling region.
void initScheduleData(Instruction *FromI, Instruction *ToI,
ScheduleData *PrevLoadStore,
ScheduleData *NextLoadStore);
/// Updates the dependency information of a bundle and of all instructions/
/// bundles which depend on the original bundle.
void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
BoUpSLP *SLP);
/// Sets all instructions in the scheduling region to un-scheduled.
void resetSchedule();
BasicBlock *BB;
/// Simple memory allocation for ScheduleData.
std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
/// The size of a ScheduleData array in ScheduleDataChunks.
int ChunkSize;
/// The allocator position in the current chunk, which is the last entry
/// of ScheduleDataChunks.
int ChunkPos;
/// Attaches ScheduleData to Instruction.
/// Note that the mapping survives during all vectorization iterations, i.e.
/// ScheduleData structures are recycled.
DenseMap<Value *, ScheduleData *> ScheduleDataMap;
struct ReadyList : SmallVector<ScheduleData *, 8> {
void insert(ScheduleData *SD) { push_back(SD); }
};
/// The ready-list for scheduling (only used for the dry-run).
ReadyList ReadyInsts;
/// The first instruction of the scheduling region.
Instruction *ScheduleStart;
/// The first instruction _after_ the scheduling region.
Instruction *ScheduleEnd;
/// The first memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *FirstLoadStoreInRegion;
/// The last memory accessing instruction in the scheduling region
/// (can be null).
ScheduleData *LastLoadStoreInRegion;
/// The current size of the scheduling region.
int ScheduleRegionSize;
/// The maximum size allowed for the scheduling region.
int ScheduleRegionSizeLimit;
/// The ID of the scheduling region. For a new vectorization iteration this
/// is incremented which "removes" all ScheduleData from the region.
int SchedulingRegionID;
};
/// Attaches the BlockScheduling structures to basic blocks.
MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
void scheduleBlock(BlockScheduling *BS);
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
// Number of load bundles that contain consecutive loads.
int NumLoadsWantToKeepOrder;
// Number of load bundles that contain consecutive loads in reversed order.
int NumLoadsWantToChangeOrder;
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
IRBuilder<> Builder;
/// A map of scalar integer values to the smallest bit width with which they
/// can legally be represented. The values map to (width, signed) pairs,
/// where "width" indicates the minimum bit width and "signed" is True if the
/// value must be signed-extended, rather than zero-extended, back to its
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
} // end namespace slpvectorizer
} // end namespace llvm
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
UserIgnoreList = UserIgnoreLst;
if (!allSameType(Roots))
return;
buildTree_rec(Roots, 0);
// Collect the values that we need to extract from the tree.
for (TreeEntry &EIdx : VectorizableTree) {
TreeEntry *Entry = &EIdx;
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
// No need to handle users of gathered values.
if (Entry->NeedToGather)
continue;
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
continue;
// Skip in-tree scalars that become vectors
if (ScalarToTreeEntry.count(U)) {
int Idx = ScalarToTreeEntry[U];
TreeEntry *UseEntry = &VectorizableTree[Idx];
Value *UseScalar = UseEntry->Scalars[0];
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in Lane 0 will
// be used.
if (UseScalar != U ||
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
continue;
}
}
// Ignore users in the user ignore list.
if (is_contained(UserIgnoreList, UserInst))
continue;
DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
Lane << " from " << *Scalar << ".\n");
ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
}
}
}
}
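// For illustration: if a vectorized scalar also feeds an instruction outside
// the tree (say, a return) that is not in UserIgnoreList, an ExternalUser
// entry (scalar, user, lane) is recorded so that the lane can later be
// extracted from the vector.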
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
bool isAltShuffle = false;
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
newTreeEntry(VL, false);
return;
}
// Don't handle vectors.
if (VL[0]->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, false);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
if (SI->getValueOperand()->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
newTreeEntry(VL, false);
return;
}
unsigned Opcode = getSameOpcode(VL);
// Check that this shuffle vector refers to the alternate
// sequence of opcodes.
if (Opcode == Instruction::ShuffleVector) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
unsigned Op = I0->getOpcode();
if (Op != Instruction::ShuffleVector)
isAltShuffle = true;
}
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false);
return;
}
// We now know that this is a vector of instructions of the same type from
// the same block.
// Don't vectorize ephemeral values.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (EphValues.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is ephemeral.\n");
newTreeEntry(VL, false);
return;
}
}
// Check if this is a duplicate of another entry.
if (ScalarToTreeEntry.count(VL[0])) {
int Idx = ScalarToTreeEntry[VL[0]];
TreeEntry *E = &VectorizableTree[Idx];
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
newTreeEntry(VL, false);
return;
}
}
DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
return;
}
// Check that none of the instructions in the bundle are already in the tree.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (ScalarToTreeEntry.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is already in tree.\n");
newTreeEntry(VL, false);
return;
}
}
// If any of the scalars is marked as a value that needs to stay scalar then
// we need to gather the scalars.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false);
return;
}
}
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
Instruction *VL0 = cast<Instruction>(VL[0]);
BasicBlock *BB = cast<Instruction>(VL0)->getParent();
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
newTreeEntry(VL, false);
return;
}
// Check that every instruction appears only once in this bundle.
for (unsigned i = 0, e = VL.size(); i < e; ++i)
for (unsigned j = i+1; j < e; ++j)
if (VL[i] == VL[j]) {
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, false);
return;
}
auto &BSRef = BlocksSchedules[BB];
if (!BSRef) {
BSRef = llvm::make_unique<BlockScheduling>(BB);
}
BlockScheduling &BS = *BSRef.get();
if (!BS.tryScheduleBundle(VL, this)) {
DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL[0]) ||
!BS.getScheduleData(VL[0])->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, false);
return;
}
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
switch (Opcode) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
// Check for terminator values (e.g. invoke).
for (unsigned j = 0; j < VL.size(); ++j)
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
TerminatorInst *Term = dyn_cast<TerminatorInst>(
cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
buildTree_rec(Operands, Depth + 1);
}
return;
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
bool Reuse = canReuseExtract(VL, Opcode);
if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
} else {
BS.cancelScheduling(VL);
}
newTreeEntry(VL, Reuse);
return;
}
case Instruction::Load: {
// Check that a vectorized load would load the same memory as a scalar
// load.
// For example, we don't want to vectorize loads that are smaller than 8 bits.
// Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM treats
// loading/storing it as an i8 struct. If we vectorize loads/stores from
// such a struct, we read/write packed bits that disagree with the
// unvectorized version.
Type *ScalarTy = VL[0]->getType();
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
// Make sure all loads in the bundle are simple - we can't vectorize
// atomic or volatile loads.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
}
// Check if the loads are consecutive, reversed, or neither.
// TODO: What we really want is to sort the loads, but for now, check
// the two likely directions.
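// For example, loads of a[0], a[1], a[2], a[3] keep Consecutive true (the
// first pair already clears ReverseConsecutive), whereas loads of a[3], a[2],
// a[1], a[0] fail the forward check on the first pair and are then verified
// pair by pair in the reverse direction below.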
bool Consecutive = true;
bool ReverseConsecutive = true;
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
Consecutive = false;
break;
} else {
ReverseConsecutive = false;
}
}
if (Consecutive) {
++NumLoadsWantToKeepOrder;
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
}
// If none of the load pairs were consecutive when checked in order,
// check the reverse order.
if (ReverseConsecutive)
for (unsigned i = VL.size() - 1; i > 0; --i)
if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
ReverseConsecutive = false;
break;
}
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
if (ReverseConsecutive) {
++NumLoadsWantToChangeOrder;
DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
} else {
DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
}
return;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
for (unsigned i = 0; i < VL.size(); ++i) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return;
}
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
return;
}
case Instruction::ICmp:
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
CmpInst *Cmp = cast<CmpInst>(VL[i]);
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
return;
}
case Instruction::Select:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right);
buildTree_rec(Left, Depth + 1);
buildTree_rec(Right, Depth + 1);
return;
}
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth+1);
}
return;
}
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
}
// We can't combine several GEPs into one vector if they operate on
// different types.
Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
for (unsigned j = 0; j < VL.size(); ++j) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
}
// We don't combine GEPs with non-constant indexes.
for (unsigned j = 0; j < VL.size(); ++j) {
auto Op = cast<Instruction>(VL[j])->getOperand(1);
if (!isa<ConstantInt>(Op)) {
DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
return;
}
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth + 1);
}
return;
}
case Instruction::Store: {
// Check if the stores are consecutive or if we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(0));
buildTree_rec(Operands, Depth + 1);
return;
}
case Instruction::Call: {
// Check if the calls are all to the same vectorizable intrinsic.
CallInst *CI = cast<CallInst>(VL[0]);
// Check if this is an Intrinsic call or something that can be
// represented by an intrinsic call
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
Function *Int = CI->getCalledFunction();
Value *A1I = nullptr;
if (hasVectorInstrinsicScalarOpd(ID, 1))
A1I = CI->getArgOperand(1);
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
if (!CI2 || CI2->getCalledFunction() != Int ||
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
return;
}
// ctlz, cttz and powi are special intrinsics whose second argument
// must be the same for them to be vectorized.
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
<< "\n");
return;
}
}
// Verify that the bundle operands are identical between the two calls.
if (CI->hasOperandBundles() &&
!std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
<< *VL[i] << '\n');
return;
}
}
newTreeEntry(VL, true);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL) {
CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
buildTree_rec(Operands, Depth + 1);
}
return;
}
case Instruction::ShuffleVector: {
// If this is not an alternate sequence of opcodes like add-sub
// then do not vectorize this instruction.
if (!isAltShuffle) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
reorderAltShuffleOperands(VL, Left, Right);
buildTree_rec(Left, Depth + 1);
buildTree_rec(Right, Depth + 1);
return;
}
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
buildTree_rec(Operands, Depth + 1);
}
return;
}
default:
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
}
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N;
Type *EltTy;
auto *ST = dyn_cast<StructType>(T);
if (ST) {
N = ST->getNumElements();
EltTy = *ST->element_begin();
} else {
N = cast<ArrayType>(T)->getNumElements();
EltTy = cast<ArrayType>(T)->getElementType();
}
if (!isValidElementType(EltTy))
return 0;
uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
if (ST) {
// Check that struct is homogeneous.
for (const auto *Ty : ST->elements())
if (Ty != EltTy)
return 0;
}
return N;
}
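// For example, a homogeneous struct of four floats maps to <4 x float>
// (returning 4) as long as its 128-bit size falls within
// [MinVecRegSize, MaxVecRegSize], while a struct with mixed element types, or
// one whose vector form would be smaller than MinVecRegSize, yields 0.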
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
assert(Opcode == Instruction::ExtractElement ||
Opcode == Instruction::ExtractValue);
assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
// Check if all of the extracts come from the same vector and from the
// correct offset.
Value *VL0 = VL[0];
Instruction *E0 = cast<Instruction>(VL0);
Value *Vec = E0->getOperand(0);
// We have to extract from a vector/aggregate with the same number of elements.
unsigned NElts;
if (Opcode == Instruction::ExtractValue) {
const DataLayout &DL = E0->getModule()->getDataLayout();
NElts = canMapToVector(Vec->getType(), DL);
if (!NElts)
return false;
// Check if load can be rewritten as load of vector.
LoadInst *LI = dyn_cast<LoadInst>(Vec);
if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
return false;
} else {
NElts = Vec->getType()->getVectorNumElements();
}
if (NElts != VL.size())
return false;
// Check that all of the indices extract from the correct offset.
if (!matchExtractIndex(E0, 0, Opcode))
return false;
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
if (!matchExtractIndex(E, i, Opcode))
return false;
if (E->getOperand(0) != Vec)
return false;
}
return true;
}
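// For example, extracts of lanes 0, 1, 2, 3 (in that order) from the same
// <4 x i32> vector can reuse that vector directly, while extracts taken from
// two different vectors, or in a permuted lane order, cannot.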
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
if (MinBWs.count(VL[0]))
VecTy = VectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
if (E->NeedToGather) {
if (allConstant(VL))
return 0;
if (isSplat(VL)) {
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
}
return getGatherCost(E->Scalars);
}
unsigned Opcode = getSameOpcode(VL);
assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = cast<Instruction>(VL[0]);
switch (Opcode) {
case Instruction::PHI: {
return 0;
}
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
if (canReuseExtract(VL, Opcode)) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
if (E->hasOneUse())
// Take credit for instruction that will become dead.
DeadCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
}
return -DeadCost;
}
return getGatherCost(VecTy);
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
VL0->getType(), SrcTy);
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
return VecCost - ScalarCost;
}
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() *
TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
return VecCost - ScalarCost;
}
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
// Certain instructions can be cheaper to vectorize if they have a
// constant second vector operand.
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
TargetTransformInfo::OP_None;
// If all operands are exactly the same ConstantInt then set the
// operand kind to OK_UniformConstantValue.
// If instead not all operands are constants, then set the operand kind
// to OK_AnyValue. If all operands are constants but not the same,
// then set the operand kind to OK_NonUniformConstantValue.
ConstantInt *CInt = nullptr;
for (unsigned i = 0; i < VL.size(); ++i) {
const Instruction *I = cast<Instruction>(VL[i]);
if (!isa<ConstantInt>(I->getOperand(1))) {
Op2VK = TargetTransformInfo::OK_AnyValue;
break;
}
if (i == 0) {
CInt = cast<ConstantInt>(I->getOperand(1));
continue;
}
if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
CInt != cast<ConstantInt>(I->getOperand(1)))
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
}
// FIXME: Currently the cost-model modification for division by a power of
// 2 is handled only for X86 and AArch64. Add support for other targets.
if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
int ScalarCost = VecTy->getNumElements() *
TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
Op2VK, Op1VP, Op2VP);
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
Op1VP, Op2VP);
return VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
int ScalarCost =
VecTy->getNumElements() *
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
int VecCost =
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
return VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
int ScalarLdCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
VecTy, alignment, 0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
int ScalarStCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
VecTy, alignment, 0);
return VecStCost - ScalarStCost;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
SmallVector<Type*, 4> ScalarTys, VecTys;
for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
ScalarTys.push_back(CI->getArgOperand(op)->getType());
VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
VecTy->getNumElements()));
}
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
int ScalarCallCost = VecTy->getNumElements() *
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
<< " for " << *CI << "\n");
return VecCallCost - ScalarCallCost;
}
case Instruction::ShuffleVector: {
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
int ScalarCost = 0;
int VecCost = 0;
for (Value *i : VL) {
Instruction *I = cast<Instruction>(i);
if (!I)
break;
ScalarCost +=
TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
}
// VecCost is the sum of the cost of creating the two operand vectors
// and the cost of creating the shuffle.
Instruction *I0 = cast<Instruction>(VL[0]);
VecCost =
TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
Instruction *I1 = cast<Instruction>(VL[1]);
VecCost +=
TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
VecCost +=
TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
return VecCost - ScalarCost;
}
default:
llvm_unreachable("Unknown instruction");
}
}
bool BoUpSLP::isFullyVectorizableTinyTree() {
DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
VectorizableTree.size() << " is fully vectorizable .\n");
// We only handle trees of heights 1 and 2.
if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
return true;
if (VectorizableTree.size() != 2)
return false;
// Handle splat and all-constants stores.
if (!VectorizableTree[0].NeedToGather &&
(allConstant(VectorizableTree[1].Scalars) ||
isSplat(VectorizableTree[1].Scalars)))
return true;
// Gathering cost would be too much for tiny trees.
if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
return false;
return true;
}
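// For example, a two-entry tree whose root is a vectorizable store bundle fed
// by an all-constant or splatted operand bundle is accepted, while a two-entry
// tree in which either entry must be gathered (and the operands are neither
// constant nor a splat) is rejected.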
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
return false;
// If we have a tiny tree (a tree whose size is less than MinTreeSize), we
// can vectorize it if we can prove it fully vectorizable.
if (isFullyVectorizableTinyTree())
return false;
assert(VectorizableTree.empty()
? ExternalUses.empty()
: true && "We shouldn't have any external users");
// Otherwise, we can't vectorize the tree. It is both tiny and not fully
// vectorizable.
return true;
}
int BoUpSLP::getSpillCost() {
// Walk from the bottom of the tree to the top, tracking which values are
// live. When we see a call instruction that is not part of our tree,
// query TTI to see if there is a cost to keeping values live over it
// (for example, if spills and fills are required).
unsigned BundleWidth = VectorizableTree.front().Scalars.size();
int Cost = 0;
SmallPtrSet<Instruction*, 4> LiveValues;
Instruction *PrevInst = nullptr;
for (const auto &N : VectorizableTree) {
Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
if (!Inst)
continue;
if (!PrevInst) {
PrevInst = Inst;
continue;
}
// Update LiveValues.
LiveValues.erase(PrevInst);
for (auto &J : PrevInst->operands()) {
if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
LiveValues.insert(cast<Instruction>(&*J));
}
DEBUG(
dbgs() << "SLP: #LV: " << LiveValues.size();
for (auto *X : LiveValues)
dbgs() << " " << X->getName();
dbgs() << ", Looking at ";
Inst->dump();
);
// Now find the sequence of instructions between PrevInst and Inst.
BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
PrevInstIt =
PrevInst->getIterator().getReverse();
while (InstIt != PrevInstIt) {
if (PrevInstIt == PrevInst->getParent()->rend()) {
PrevInstIt = Inst->getParent()->rbegin();
continue;
}
if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
SmallVector<Type*, 4> V;
for (auto *II : LiveValues)
V.push_back(VectorType::get(II->getType(), BundleWidth));
Cost += TTI->getCostOfKeepingLiveOverCall(V);
}
++PrevInstIt;
}
PrevInst = Inst;
}
return Cost;
}
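// For example, if a value produced by the tree is still live across an
// unrelated call that sits between two tree instructions, TTI is asked for the
// cost of keeping a <BundleWidth x Ty> vector live over that call, and that
// cost is added to the total.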
int BoUpSLP::getTreeCost() {
int Cost = 0;
DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
VectorizableTree.size() << ".\n");
unsigned BundleWidth = VectorizableTree[0].Scalars.size();
for (TreeEntry &TE : VectorizableTree) {
int C = getEntryCost(&TE);
DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
<< *TE.Scalars[0] << ".\n");
Cost += C;
}
SmallSet<Value *, 16> ExtractCostCalculated;
int ExtractCost = 0;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!ExtractCostCalculated.insert(EU.Scalar).second)
continue;
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
// removed as well).
if (EphValues.count(EU.User))
continue;
// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
auto *ScalarRoot = VectorizableTree[0].Scalars[0];
if (MinBWs.count(ScalarRoot)) {
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto Extend =
MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
VecTy = VectorType::get(MinTy, BundleWidth);
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
ExtractCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
}
}
int SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n");
return Cost;
}
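// As a sketch of the sign convention: the total is the sum of the per-entry
// costs (vector cost minus scalar cost), the spill cost and the extract cost,
// so a negative result means vectorization is expected to be profitable.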
int BoUpSLP::getGatherCost(Type *Ty) {
int Cost = 0;
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
return Cost;
}
int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector.
return getGatherCost(VecTy);
}
// Reorder commutative operations in alternate shuffle if the resulting vectors
// are consecutive loads. This would allow us to vectorize the tree.
// If we have something like:
// load a[0] - load b[0]
// load b[1] + load a[1]
// load a[2] - load b[2]
// load a[3] + load b[3]
// Swapping the operands of the second operation (load b[1] + load a[1]) would
// allow us to vectorize this code.
void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
// Push left and right operands of binary operation into Left and Right
for (Value *i : VL) {
Left.push_back(cast<Instruction>(i)->getOperand(0));
Right.push_back(cast<Instruction>(i)->getOperand(1));
}
// Reorder if we have a commutative operation and consecutive accesses
// are on either side of the alternate instructions.
for (unsigned j = 0; j < VL.size() - 1; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j], Right[j]);
continue;
} else if (VL2->isCommutative() &&
isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
// else unchanged
}
}
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j], Right[j]);
continue;
} else if (VL2->isCommutative() &&
isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
// else unchanged
}
}
}
}
// Return true if I should be commuted before adding its left and right
// operands to the arrays Left and Right.
//
// The vectorizer is trying either to have all elements on one side be
// instructions with the same opcode, to enable further vectorization, or to
// have a splat to lower the vectorization cost.
static bool shouldReorderOperands(int i, Instruction &I,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
bool AllSameOpcodeLeft,
bool AllSameOpcodeRight, bool SplatLeft,
bool SplatRight) {
Value *VLeft = I.getOperand(0);
Value *VRight = I.getOperand(1);
// If we have "SplatRight", try to see if commuting is needed to preserve it.
if (SplatRight) {
if (VRight == Right[i - 1])
// Preserve SplatRight
return false;
if (VLeft == Right[i - 1]) {
// Commuting would preserve SplatRight, but we don't want to break
// SplatLeft either, i.e. preserve the original order if possible.
// (FIXME: why do we care?)
if (SplatLeft && VLeft == Left[i - 1])
return false;
return true;
}
}
// Symmetrically handle Right side.
if (SplatLeft) {
if (VLeft == Left[i - 1])
// Preserve SplatLeft
return false;
if (VRight == Left[i - 1])
return true;
}
Instruction *ILeft = dyn_cast<Instruction>(VLeft);
Instruction *IRight = dyn_cast<Instruction>(VRight);
// If we have "AllSameOpcodeRight", try to see if the left operands preserves
// it and not the right, in this case we want to commute.
if (AllSameOpcodeRight) {
unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
if (IRight && RightPrevOpcode == IRight->getOpcode())
// Do not commute, a match on the right preserves AllSameOpcodeRight
return false;
if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
// We have a match and may want to commute, but first check if there is
// not also a match on the existing operands on the Left to preserve
// AllSameOpcodeLeft, i.e. preserve the original order if possible.
// (FIXME: why do we care?)
if (AllSameOpcodeLeft && ILeft &&
cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
return false;
return true;
}
}
// Symmetrically handle Left side.
if (AllSameOpcodeLeft) {
unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
return false;
if (IRight && LeftPrevOpcode == IRight->getOpcode())
return true;
}
return false;
}
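// For example, with a splat of %x accumulated on the right, an instruction
// whose second operand is %x keeps its operand order, while one whose first
// operand is %x (and which would not also preserve a splat on the left) is
// commuted so that the right-hand splat is preserved.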
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
if (VL.size()) {
// Peel the first iteration out of the loop since there's nothing
// interesting to do anyway and it simplifies the checks in the loop.
auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
auto VRight = cast<Instruction>(VL[0])->getOperand(1);
if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
// Favor having instruction to the right. FIXME: why?
std::swap(VLeft, VRight);
Left.push_back(VLeft);
Right.push_back(VRight);
}
// Keep track of whether each side holds instructions that all have the same opcode.
bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
// Keep track of whether either side holds all the same value (a broadcast).
bool SplatLeft = true;
bool SplatRight = true;
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
Instruction *I = cast<Instruction>(VL[i]);
assert(I->isCommutative() && "Can only process commutative instruction");
// Commute to favor either a splat or maximizing having the same opcodes on
// one side.
if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
AllSameOpcodeRight, SplatLeft, SplatRight)) {
Left.push_back(I->getOperand(1));
Right.push_back(I->getOperand(0));
} else {
Left.push_back(I->getOperand(0));
Right.push_back(I->getOperand(1));
}
// Update Splat* and AllSameOpcode* after the insertion.
SplatRight = SplatRight && (Right[i - 1] == Right[i]);
SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
(cast<Instruction>(Left[i - 1])->getOpcode() ==
cast<Instruction>(Left[i])->getOpcode());
AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
(cast<Instruction>(Right[i - 1])->getOpcode() ==
cast<Instruction>(Right[i])->getOpcode());
}
// If one operand ends up being a broadcast, return this operand order.
if (SplatRight || SplatLeft)
return;
// Finally check if we can get longer vectorizable chain by reordering
// without breaking the good operand order detected above.
// E.g. If we have something like-
// load a[0] load b[0]
// load b[1] load a[1]
// load a[2] load b[2]
// load a[3] load b[3]
// Reordering the second load b[1] load a[1] would allow us to vectorize
// this code and we still retain AllSameOpcode property.
// FIXME: This load reordering might break AllSameOpcode in some rare cases
// such as-
// add a[0],c[0] load b[0]
// add a[1],c[2] load b[1]
// b[2] load b[2]
// add a[3],c[3] load b[3]
for (unsigned j = 0; j < VL.size() - 1; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
if (isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
}
}
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
if (isConsecutiveAccess(L, L1, *DL, *SE)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
}
}
// else unchanged
}
}
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
auto *Front = cast<Instruction>(VL.front());
auto *BB = Front->getParent();
assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool {
return cast<Instruction>(V)->getParent() == BB;
}));
// The last instruction in the bundle in program order.
Instruction *LastInst = nullptr;
// Find the last instruction. The common case should be that BB has been
// scheduled, and the last instruction is VL.back(). So we start with
// VL.back() and iterate over schedule data until we reach the end of the
// bundle. The end of the bundle is marked by null ScheduleData.
if (BlocksSchedules.count(BB)) {
auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back());
if (Bundle && Bundle->isPartOfBundle())
for (; Bundle; Bundle = Bundle->NextInBundle)
LastInst = Bundle->Inst;
}
// LastInst can still be null at this point if there's either not an entry
// for BB in BlocksSchedules or there's no ScheduleData available for
// VL.back(). This can be the case if buildTree_rec aborts for various
// reasons (e.g., the maximum recursion depth is reached, the maximum region
// size is reached, etc.). ScheduleData is initialized in the scheduling
// "dry-run".
//
// If this happens, we can still find the last instruction by brute force. We
// iterate forwards from Front (inclusive) until we either see all
// instructions in the bundle or reach the end of the block. If Front is the
// last instruction in program order, LastInst will be set to Front, and we
// will visit all the remaining instructions in the block.
//
// One of the reasons we exit early from buildTree_rec is to place an upper
// bound on compile-time. Thus, taking an additional compile-time hit here is
// not ideal. However, this should be exceedingly rare since it requires that
// we both exit early from buildTree_rec and that the bundle be out-of-order
// (causing us to iterate all the way to the end of the block).
if (!LastInst) {
SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
if (Bundle.erase(&I))
LastInst = &I;
if (Bundle.empty())
break;
}
}
// Set the insertion point after the last instruction in the bundle. Set the
// debug location to Front.
Builder.SetInsertPoint(BB, ++LastInst->getIterator());
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
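// Gather builds a vector of type Ty from the scalars in VL by chaining
// insertelement instructions. Scalars that are themselves part of the
// vectorizable tree are recorded in ExternalUses so that a matching extract
// can be generated later.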
Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
Value *Vec = UndefValue::get(Ty);
// Generate the 'InsertElement' instruction.
for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(Insrt);
CSEBlocks.insert(Insrt->getParent());
// Add to our 'need-to-extract' list.
if (ScalarToTreeEntry.count(VL[i])) {
int Idx = ScalarToTreeEntry[VL[i]];
TreeEntry *E = &VectorizableTree[Idx];
// Find which lane we need to extract.
int FoundLane = -1;
for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
// Is this the lane of the scalar that we are looking for ?
if (E->Scalars[Lane] == VL[i]) {
FoundLane = Lane;
break;
}
}
assert(FoundLane >= 0 && "Could not find the correct lane");
ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
}
}
}
return Vec;
}
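// Return the existing vector value if this exact bundle (VL) has already been
// vectorized as a tree entry; otherwise return null.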
Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
SmallDenseMap<Value*, int>::const_iterator Entry
= ScalarToTreeEntry.find(VL[0]);
if (Entry != ScalarToTreeEntry.end()) {
int Idx = Entry->second;
const TreeEntry *En = &VectorizableTree[Idx];
if (En->isSame(VL) && En->VectorizedValue)
return En->VectorizedValue;
}
return nullptr;
}
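// Vectorize VL through its tree entry if the bundle matches one; otherwise
// emit a gather of the scalars.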
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
if (ScalarToTreeEntry.count(VL[0])) {
int Idx = ScalarToTreeEntry[VL[0]];
TreeEntry *E = &VectorizableTree[Idx];
if (E->isSame(VL))
return vectorizeTree(E);
}
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
return Gather(VL, VecTy);
}
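// Generate vector code for the tree entry E, dispatching on the common opcode
// of its scalars. The result is cached in E->VectorizedValue.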
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->VectorizedValue) {
DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
if (E->NeedToGather) {
setInsertPointAfterBundle(E->Scalars);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
}
unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
E->VectorizedValue = NewPhi;
// PHINodes may have multiple entries from the same block. We want to
// visit every block once.
SmallSet<BasicBlock*, 4> VisitedBBs;
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
BasicBlock *IBB = PH->getIncomingBlock(i);
if (!VisitedBBs.insert(IBB).second) {
NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
continue;
}
// Prepare the operand vector.
for (Value *V : E->Scalars)
Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
Value *Vec = vectorizeTree(Operands);
NewPhi->addIncoming(Vec, IBB);
}
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
"Invalid number of incoming values");
return NewPhi;
}
case Instruction::ExtractElement: {
if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
Value *V = VL0->getOperand(0);
E->VectorizedValue = V;
return V;
}
setInsertPointAfterBundle(E->Scalars);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
Builder.SetInsertPoint(LI);
PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
E->VectorizedValue = V;
return propagateMetadata(V, E->Scalars);
}
setInsertPointAfterBundle(E->Scalars);
auto *V = Gather(E->Scalars, VecTy);
E->VectorizedValue = V;
return V;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
ValueList INVL;
for (Value *V : E->Scalars)
INVL.push_back(cast<Instruction>(V)->getOperand(0));
setInsertPointAfterBundle(E->Scalars);
Value *InVec = vectorizeTree(INVL);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
CastInst *CI = dyn_cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::FCmp:
case Instruction::ICmp: {
ValueList LHSV, RHSV;
for (Value *V : E->Scalars) {
LHSV.push_back(cast<Instruction>(V)->getOperand(0));
RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
Value *L = vectorizeTree(LHSV);
Value *R = vectorizeTree(RHSV);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V;
if (Opcode == Instruction::FCmp)
V = Builder.CreateFCmp(P0, L, R);
else
V = Builder.CreateICmp(P0, L, R);
E->VectorizedValue = V;
propagateIRFlags(E->VectorizedValue, E->Scalars);
++NumVectorInstructions;
return V;
}
case Instruction::Select: {
ValueList TrueVec, FalseVec, CondVec;
for (Value *V : E->Scalars) {
CondVec.push_back(cast<Instruction>(V)->getOperand(0));
TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
setInsertPointAfterBundle(E->Scalars);
Value *Cond = vectorizeTree(CondVec);
Value *True = vectorizeTree(TrueVec);
Value *False = vectorizeTree(FalseVec);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
Value *V = Builder.CreateSelect(Cond, True, False);
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
else
for (Value *V : E->Scalars) {
LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
E->VectorizedValue = V;
propagateIRFlags(E->VectorizedValue, E->Scalars);
++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
return V;
}
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
setInsertPointAfterBundle(E->Scalars);
LoadInst *LI = cast<LoadInst>(VL0);
Type *ScalarLoadTy = LI->getType();
unsigned AS = LI->getPointerAddressSpace();
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
VecTy->getPointerTo(AS));
// The pointer operand uses an in-tree scalar, so we add the new BitCast to
// the ExternalUses list to make sure that an extract will be generated in
// the future.
if (ScalarToTreeEntry.count(LI->getPointerOperand()))
ExternalUses.push_back(
ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
if (!Alignment) {
Alignment = DL->getABITypeAlignment(ScalarLoadTy);
}
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
++NumVectorInstructions;
return propagateMetadata(LI, E->Scalars);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
unsigned Alignment = SI->getAlignment();
unsigned AS = SI->getPointerAddressSpace();
ValueList ValueOp;
for (Value *V : E->Scalars)
ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
setInsertPointAfterBundle(E->Scalars);
Value *VecValue = vectorizeTree(ValueOp);
Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
VecTy->getPointerTo(AS));
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
// The pointer operand uses an in-tree scalar, so we add the new BitCast to
// the ExternalUses list to make sure that an extract will be generated in
// the future.
if (ScalarToTreeEntry.count(SI->getPointerOperand()))
ExternalUses.push_back(
ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
if (!Alignment) {
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
}
S->setAlignment(Alignment);
E->VectorizedValue = S;
++NumVectorInstructions;
return propagateMetadata(S, E->Scalars);
}
case Instruction::GetElementPtr: {
setInsertPointAfterBundle(E->Scalars);
ValueList Op0VL;
for (Value *V : E->Scalars)
Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
Value *Op0 = vectorizeTree(Op0VL);
std::vector<Value *> OpVecs;
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
++j) {
ValueList OpVL;
for (Value *V : E->Scalars)
OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
Value *OpVec = vectorizeTree(OpVL);
OpVecs.push_back(OpVec);
}
Value *V = Builder.CreateGEP(
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
E->VectorizedValue = V;
++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
return V;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
setInsertPointAfterBundle(E->Scalars);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
Value *ScalarArg = nullptr;
if (CI && (FI = CI->getCalledFunction())) {
IID = FI->getIntrinsicID();
}
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
// ctlz,cttz and powi are special intrinsics whose second argument is
// a scalar. This argument should not be vectorized.
if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
CallInst *CEI = cast<CallInst>(E->Scalars[0]);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
for (Value *V : E->Scalars) {
CallInst *CEI = cast<CallInst>(V);
OpVL.push_back(CEI->getArgOperand(j));
}
Value *OpVec = vectorizeTree(OpVL);
DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
OpVecs.push_back(OpVec);
}
Module *M = F->getParent();
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
SmallVector<OperandBundleDef, 1> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
// The scalar argument uses an in-tree scalar, so we add the new vectorized
// call to the ExternalUses list to make sure that an extract will be
// generated in the future.
if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
E->VectorizedValue = V;
propagateIRFlags(E->VectorizedValue, E->Scalars);
++NumVectorInstructions;
return V;
}
case Instruction::ShuffleVector: {
ValueList LHSVL, RHSVL;
assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
setInsertPointAfterBundle(E->Scalars);
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
// Create a vector of LHS op1 RHS
BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
// Create a vector of LHS op2 RHS
Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
// Create shuffle to take alternate operations from the vector.
// Also, gather up odd and even scalar ops to propagate IR flags to
// each vector operation.
ValueList OddScalars, EvenScalars;
unsigned e = E->Scalars.size();
SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
if (i & 1) {
Mask[i] = Builder.getInt32(e + i);
OddScalars.push_back(E->Scalars[i]);
} else {
Mask[i] = Builder.getInt32(i);
EvenScalars.push_back(E->Scalars[i]);
}
}
Value *ShuffleMask = ConstantVector::get(Mask);
propagateIRFlags(V0, EvenScalars);
propagateIRFlags(V1, OddScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
E->VectorizedValue = V;
++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
return propagateMetadata(I, E->Scalars);
return V;
}
default:
llvm_unreachable("unknown inst");
}
return nullptr;
}
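// Emit vector code for the whole tree: schedule all involved blocks, vectorize
// the root entry, generate extracts for external users of in-tree scalars, and
// finally erase the scalar instructions that were vectorized.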
Value *BoUpSLP::vectorizeTree() {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());
}
Builder.SetInsertPoint(&F->getEntryBlock().front());
auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
// If the vectorized tree can be rewritten in a smaller type, we truncate the
// vectorized root. InstCombine will then rewrite the entire expression. We
// sign extend the extracted values below.
auto *ScalarRoot = VectorizableTree[0].Scalars[0];
if (MinBWs.count(ScalarRoot)) {
if (auto *I = dyn_cast<Instruction>(VectorRoot))
Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
auto BundleWidth = VectorizableTree[0].Scalars.size();
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto *VecTy = VectorType::get(MinTy, BundleWidth);
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
VectorizableTree[0].VectorizedValue = Trunc;
}
DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
// If necessary, sign-extend or zero-extend ScalarRoot to the larger type
// specified by ScalarType.
auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
if (!MinBWs.count(ScalarRoot))
return Ex;
if (MinBWs[ScalarRoot].second)
return Builder.CreateSExt(Ex, ScalarType);
return Builder.CreateZExt(Ex, ScalarType);
};
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
llvm::User *User = ExternalUse.User;
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
if (!is_contained(Scalar->users(), User))
continue;
assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
int Idx = ScalarToTreeEntry[Scalar];
TreeEntry *E = &VectorizableTree[Idx];
assert(!E->NeedToGather && "Extracting from a gather list");
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
if (PHINode *PH = dyn_cast<PHINode>(User)) {
for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
if (PH->getIncomingValue(i) == Scalar) {
TerminatorInst *IncomingTerminator =
PH->getIncomingBlock(i)->getTerminator();
if (isa<CatchSwitchInst>(IncomingTerminator)) {
Builder.SetInsertPoint(VecI->getParent(),
std::next(VecI->getIterator()));
} else {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
}
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);
}
}
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
}
DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
}
// For each vectorized value:
for (TreeEntry &EIdx : VectorizableTree) {
TreeEntry *Entry = &EIdx;
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
// No need to handle users of gathered values.
if (Entry->NeedToGather)
continue;
assert(Entry->VectorizedValue && "Can't find vectorizable value");
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
#ifndef NDEBUG
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
assert((ScalarToTreeEntry.count(U) ||
// It is legal to replace users in the ignorelist by undef.
is_contained(UserIgnoreList, U)) &&
"Replacing out-of-tree value with undef");
}
#endif
Value *Undef = UndefValue::get(Ty);
Scalar->replaceAllUsesWith(Undef);
}
DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
eraseInstruction(cast<Instruction>(Scalar));
}
}
Builder.ClearInsertionPoint();
return VectorizableTree[0].VectorizedValue;
}
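// Hoist loop-invariant insertelement sequences into loop preheaders, then CSE
// identical insert/extract element instructions across the blocks that contain
// gather sequences.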
void BoUpSLP::optimizeGatherSequence() {
DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
for (Instruction *it : GatherSeq) {
InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
if (!Insert)
continue;
// Check if this block is inside a loop.
Loop *L = LI->getLoopFor(Insert->getParent());
if (!L)
continue;
// Check if it has a preheader.
BasicBlock *PreHeader = L->getLoopPreheader();
if (!PreHeader)
continue;
// If the vector or the element that we insert into it are
// instructions that are defined in this basic block then we can't
// hoist this instruction.
Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
if (CurrVec && L->contains(CurrVec))
continue;
if (NewElem && L->contains(NewElem))
continue;
// We can hoist this instruction. Move it to the pre-header.
Insert->moveBefore(PreHeader->getTerminator());
}
// Make a list of all reachable blocks in our CSE queue.
SmallVector<const DomTreeNode *, 8> CSEWorkList;
CSEWorkList.reserve(CSEBlocks.size());
for (BasicBlock *BB : CSEBlocks)
if (DomTreeNode *N = DT->getNode(BB)) {
assert(DT->isReachableFromEntry(N));
CSEWorkList.push_back(N);
}
// Sort blocks by domination. This ensures we visit a block after all blocks
// dominating it are visited.
std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
[this](const DomTreeNode *A, const DomTreeNode *B) {
return DT->properlyDominates(A, B);
});
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
"Worklist not sorted properly!");
BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
Instruction *In = &*it++;
if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
for (Instruction *v : Visited) {
if (In->isIdenticalTo(v) &&
DT->dominates(v->getParent(), In->getParent())) {
In->replaceAllUsesWith(v);
eraseInstruction(In);
In = nullptr;
break;
}
}
if (In) {
assert(!is_contained(Visited, In));
Visited.push_back(In);
}
}
}
CSEBlocks.clear();
GatherSeq.clear();
}
// Groups the instructions to a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
BoUpSLP *SLP) {
if (isa<PHINode>(VL[0]))
return true;
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
ScheduleData *PrevInBundle = nullptr;
ScheduleData *Bundle = nullptr;
bool ReSchedule = false;
DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
if (!extendSchedulingRegion(V))
return false;
}
for (Value *V : VL) {
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member (maybe not in same basic block)");
if (BundleMember->IsScheduled) {
// A bundle member was scheduled as single instruction before and now
// needs to be scheduled as part of the bundle. We just get rid of the
// existing schedule.
DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
<< " was already scheduled\n");
ReSchedule = true;
}
assert(BundleMember->isSchedulingEntity() &&
"bundle member already part of other bundle");
if (PrevInBundle) {
PrevInBundle->NextInBundle = BundleMember;
} else {
Bundle = BundleMember;
}
BundleMember->UnscheduledDepsInBundle = 0;
Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
// Group the instructions to a bundle.
BundleMember->FirstInBundle = Bundle;
PrevInBundle = BundleMember;
}
if (ScheduleEnd != OldScheduleEnd) {
// The scheduling region got new instructions at the lower end (or it is a
// new region for the first bundle). This makes it necessary to
// recalculate all dependencies.
// It is seldom that this needs to be done a second time after adding the
// initial bundle to the region.
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
ScheduleData *SD = getScheduleData(I);
SD->clearDependencies();
}
ReSchedule = true;
}
if (ReSchedule) {
resetSchedule();
initialFillReadyList(ReadyInsts);
}
DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
<< BB->getName() << "\n");
calculateDependencies(Bundle, true, SLP);
// Now try to schedule the new bundle. As soon as the bundle is "ready" it
// means that there are no cyclic dependencies and we can schedule it.
// Note that it's important that we don't "schedule" the bundle yet (see
// cancelScheduling).
while (!Bundle->isReady() && !ReadyInsts.empty()) {
ScheduleData *pickedSD = ReadyInsts.back();
ReadyInsts.pop_back();
if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
schedule(pickedSD, ReadyInsts);
}
}
if (!Bundle->isReady()) {
cancelScheduling(VL);
return false;
}
return true;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
if (isa<PHINode>(VL[0]))
return;
ScheduleData *Bundle = getScheduleData(VL[0]);
DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
"tried to unbundle something which is not a bundle");
// Un-bundle: make single instructions out of the bundle.
ScheduleData *BundleMember = Bundle;
while (BundleMember) {
assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
BundleMember->FirstInBundle = BundleMember;
ScheduleData *Next = BundleMember->NextInBundle;
BundleMember->NextInBundle = nullptr;
BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
if (BundleMember->UnscheduledDepsInBundle == 0) {
ReadyInsts.insert(BundleMember);
}
BundleMember = Next;
}
}
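// Extend the scheduling region so that it contains V. Returns false if the
// region would exceed ScheduleRegionSizeLimit.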
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
if (getScheduleData(V))
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
if (!ScheduleStart) {
// It's the first instruction in the new region.
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
ScheduleStart = I;
ScheduleEnd = I->getNextNode();
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
return true;
}
// Search up and down at the same time, because we don't know if the new
// instruction is above or below the existing scheduling region.
BasicBlock::reverse_iterator UpIter =
++ScheduleStart->getIterator().getReverse();
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
for (;;) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
return false;
}
if (UpIter != UpperEnd) {
if (&*UpIter == I) {
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
ScheduleStart = I;
DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
return true;
}
UpIter++;
}
if (DownIter != LowerEnd) {
if (&*DownIter == I) {
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
nullptr);
ScheduleEnd = I->getNextNode();
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
return true;
}
DownIter++;
}
assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
"instruction not found in block");
}
return true;
}
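// Allocate and initialize ScheduleData for all instructions in the half-open
// range [FromI, ToI) and link memory-accessing instructions into the
// load/store chain between PrevLoadStore and NextLoadStore.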
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
Instruction *ToI,
ScheduleData *PrevLoadStore,
ScheduleData *NextLoadStore) {
ScheduleData *CurrentLoadStore = PrevLoadStore;
for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
ScheduleData *SD = ScheduleDataMap[I];
if (!SD) {
// Allocate a new ScheduleData for the instruction.
if (ChunkPos >= ChunkSize) {
ScheduleDataChunks.push_back(
llvm::make_unique<ScheduleData[]>(ChunkSize));
ChunkPos = 0;
}
SD = &(ScheduleDataChunks.back()[ChunkPos++]);
ScheduleDataMap[I] = SD;
SD->Inst = I;
}
assert(!isInSchedulingRegion(SD) &&
"new ScheduleData already in scheduling region");
SD->init(SchedulingRegionID);
if (I->mayReadOrWriteMemory()) {
// Update the linked list of memory accessing instructions.
if (CurrentLoadStore) {
CurrentLoadStore->NextLoadStore = SD;
} else {
FirstLoadStoreInRegion = SD;
}
CurrentLoadStore = SD;
}
}
if (NextLoadStore) {
if (CurrentLoadStore)
CurrentLoadStore->NextLoadStore = NextLoadStore;
} else {
LastLoadStoreInRegion = CurrentLoadStore;
}
}
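// (Re)compute def-use and memory dependencies for every member of the bundle
// rooted at SD, transitively pushing dependent bundles onto a worklist.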
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
BoUpSLP *SLP) {
assert(SD->isSchedulingEntity());
SmallVector<ScheduleData *, 10> WorkList;
WorkList.push_back(SD);
while (!WorkList.empty()) {
ScheduleData *SD = WorkList.back();
WorkList.pop_back();
ScheduleData *BundleMember = SD;
while (BundleMember) {
assert(isInSchedulingRegion(BundleMember));
if (!BundleMember->hasValidDependencies()) {
DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
BundleMember->Dependencies = 0;
BundleMember->resetUnscheduledDeps();
// Handle def-use chain dependencies.
for (User *U : BundleMember->Inst->users()) {
if (isa<Instruction>(U)) {
ScheduleData *UseSD = getScheduleData(U);
if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
BundleMember->Dependencies++;
ScheduleData *DestBundle = UseSD->FirstInBundle;
if (!DestBundle->IsScheduled) {
BundleMember->incrementUnscheduledDeps(1);
}
if (!DestBundle->hasValidDependencies()) {
WorkList.push_back(DestBundle);
}
}
} else {
// I'm not sure if this can ever happen. But we need to be safe.
// This lets the instruction/bundle never be scheduled and
// eventually disable vectorization.
BundleMember->Dependencies++;
BundleMember->incrementUnscheduledDeps(1);
}
}
// Handle the memory dependencies.
ScheduleData *DepDest = BundleMember->NextLoadStore;
if (DepDest) {
Instruction *SrcInst = BundleMember->Inst;
MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
unsigned numAliased = 0;
unsigned DistToSrc = 1;
while (DepDest) {
assert(isInSchedulingRegion(DepDest));
// We have two limits to reduce the complexity:
// 1) AliasedCheckLimit: It's a small limit to reduce calls to
// SLP->isAliased (which is the expensive part in this loop).
// 2) MaxMemDepDistance: It's for very large blocks and it aborts
// the whole loop (even if the loop is fast, it's quadratic).
// It's important for the loop break condition (see below) to
// check this limit even between two read-only instructions.
if (DistToSrc >= MaxMemDepDistance ||
((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
(numAliased >= AliasedCheckLimit ||
SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
// We increment the counter only if the locations are aliased
// (instead of counting all alias checks). This gives a better
// balance between reduced runtime and accurate dependencies.
numAliased++;
DepDest->MemoryDependencies.push_back(BundleMember);
BundleMember->Dependencies++;
ScheduleData *DestBundle = DepDest->FirstInBundle;
if (!DestBundle->IsScheduled) {
BundleMember->incrementUnscheduledDeps(1);
}
if (!DestBundle->hasValidDependencies()) {
WorkList.push_back(DestBundle);
}
}
DepDest = DepDest->NextLoadStore;
// Example, explaining the loop break condition: Let's assume our
// starting instruction is i0 and MaxMemDepDistance = 3.
//
// +--------v--v--v
// i0,i1,i2,i3,i4,i5,i6,i7,i8
// +--------^--^--^
//
// MaxMemDepDistance let us stop alias-checking at i3 and we add
// dependencies from i0 to i3,i4,.. (even if they are not aliased).
// Previously we already added dependencies from i3 to i6,i7,i8
// (because of MaxMemDepDistance). As we added a dependency from
// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
// and we can abort this loop at i6.
if (DistToSrc >= 2 * MaxMemDepDistance)
break;
DistToSrc++;
}
}
}
BundleMember = BundleMember->NextInBundle;
}
if (InsertInReadyList && SD->isReady()) {
ReadyInsts.push_back(SD);
DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
}
}
}
void BoUpSLP::BlockScheduling::resetSchedule() {
assert(ScheduleStart &&
"tried to reset schedule on block which has not been scheduled");
for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
ScheduleData *SD = getScheduleData(I);
assert(isInSchedulingRegion(SD));
SD->IsScheduled = false;
SD->resetUnscheduledDeps();
}
ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
if (!BS->ScheduleStart)
return;
DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
BS->resetSchedule();
// For the real scheduling we use a more sophisticated ready-list: it is
// sorted by the original instruction location. This lets the final schedule
// be as close as possible to the original instruction order.
struct ScheduleDataCompare {
bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
return SD2->SchedulingPriority < SD1->SchedulingPriority;
}
};
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
// Ensure that all dependency data is updated and fill the ready-list with
// initial instructions.
int Idx = 0;
int NumToSchedule = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
ScheduleData *SD = BS->getScheduleData(I);
assert(
SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
"scheduler and vectorizer have different opinion on what is a bundle");
SD->FirstInBundle->SchedulingPriority = Idx++;
if (SD->isSchedulingEntity()) {
BS->calculateDependencies(SD, false, this);
NumToSchedule++;
}
}
BS->initialFillReadyList(ReadyInsts);
Instruction *LastScheduledInst = BS->ScheduleEnd;
// Do the "real" scheduling.
while (!ReadyInsts.empty()) {
ScheduleData *picked = *ReadyInsts.begin();
ReadyInsts.erase(ReadyInsts.begin());
// Move the scheduled instruction(s) to their dedicated places, if not
// there yet.
ScheduleData *BundleMember = picked;
while (BundleMember) {
Instruction *pickedInst = BundleMember->Inst;
if (LastScheduledInst->getNextNode() != pickedInst) {
BS->BB->getInstList().remove(pickedInst);
BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
pickedInst);
}
LastScheduledInst = pickedInst;
BundleMember = BundleMember->NextInBundle;
}
BS->schedule(picked, ReadyInsts);
NumToSchedule--;
}
assert(NumToSchedule == 0 && "could not schedule all instructions");
// Avoid duplicate scheduling of the block.
BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If V is a store, just return the width of the stored value without
// traversing the expression tree. This is the common case.
if (auto *Store = dyn_cast<StoreInst>(V))
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
// If V is not a store, we can traverse the expression tree to find loads
// that feed it. The type of the loaded value may indicate a more suitable
// width than V's type. We want to base the vector element size on the width
// of memory operations where possible.
SmallVector<Instruction *, 16> Worklist;
SmallPtrSet<Instruction *, 16> Visited;
if (auto *I = dyn_cast<Instruction>(V))
Worklist.push_back(I);
// Traverse the expression tree in bottom-up order looking for loads. If we
// encounter an instruction we don't yet handle, we give up.
auto MaxWidth = 0u;
auto FoundUnknownInst = false;
while (!Worklist.empty() && !FoundUnknownInst) {
auto *I = Worklist.pop_back_val();
Visited.insert(I);
// We should only be looking at scalar instructions here. If the current
// instruction has a vector type, give up.
auto *Ty = I->getType();
if (isa<VectorType>(Ty))
FoundUnknownInst = true;
// If the current instruction is a load, update MaxWidth to reflect the
// width of the loaded value.
else if (isa<LoadInst>(I))
MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
// Otherwise, we need to visit the operands of the instruction. We only
// handle the interesting cases from buildTree here. If an operand is an
// instruction we haven't yet visited, we add it to the worklist.
else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
if (!Visited.count(J))
Worklist.push_back(J);
}
// If we don't yet handle the instruction, give up.
else
FoundUnknownInst = true;
}
// If we didn't encounter a memory access in the expression tree, or if we
// gave up for some reason, just return the width of V.
if (!MaxWidth || FoundUnknownInst)
return DL->getTypeSizeInBits(V->getType());
// Otherwise, return the maximum width we found.
return MaxWidth;
}
// Determine if a value V in a vectorizable expression Expr can be demoted to a
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
SmallVectorImpl<Value *> &ToDemote,
SmallVectorImpl<Value *> &Roots) {
// We can always demote constants.
if (isa<Constant>(V)) {
ToDemote.push_back(V);
return true;
}
// If the value is not an instruction in the expression with only one use, it
// cannot be demoted.
auto *I = dyn_cast<Instruction>(V);
if (!I || !I->hasOneUse() || !Expr.count(I))
return false;
switch (I->getOpcode()) {
// We can always demote truncations and extensions. Since truncations can
// seed additional demotion, we save the truncated value.
case Instruction::Trunc:
Roots.push_back(I->getOperand(0));
case Instruction::ZExt:
case Instruction::SExt:
break;
// We can demote certain binary operations if we can demote both of their
// operands.
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
!collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
return false;
break;
// We can demote selects if we can demote their true and false values.
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
!collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
return false;
break;
}
// We can demote phis if we can demote all their incoming operands. Note that
// we don't need to worry about cycles since we ensure single use above.
case Instruction::PHI: {
PHINode *PN = cast<PHINode>(I);
for (Value *IncValue : PN->incoming_values())
if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
return false;
break;
}
// Otherwise, conservatively give up.
default:
return false;
}
// Record the value that we can demote.
ToDemote.push_back(V);
return true;
}
void BoUpSLP::computeMinimumValueSizes() {
// If there are no external uses, the expression tree must be rooted by a
// store. We can't demote in-memory values, so there is nothing to do here.
if (ExternalUses.empty())
return;
// We only attempt to truncate integer expressions.
auto &TreeRoot = VectorizableTree[0].Scalars;
auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
if (!TreeRootIT)
return;
// If the expression is not rooted by a store, these roots should have
// external uses. We will rely on InstCombine to rewrite the expression in
// the narrower type. However, InstCombine only rewrites single-use values.
// This means that if a tree entry other than a root is used externally, it
// must have multiple uses and InstCombine will not rewrite it. The code
// below ensures that only the roots are used externally.
SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
for (auto &EU : ExternalUses)
if (!Expr.erase(EU.Scalar))
return;
if (!Expr.empty())
return;
// Collect the scalar values of the vectorizable expression. We will use this
// context to determine which values can be demoted. If we see a truncation,
// we mark it as seeding another demotion.
for (auto &Entry : VectorizableTree)
Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
// Ensure the roots of the vectorizable tree don't form a cycle. They must
// have a single external user that is not in the vectorizable tree.
for (auto *Root : TreeRoot)
if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
return;
// Conservatively determine if we can actually truncate the roots of the
// expression. Collect the values that can be demoted in ToDemote and
// additional roots that require investigating in Roots.
SmallVector<Value *, 32> ToDemote;
SmallVector<Value *, 4> Roots;
for (auto *Root : TreeRoot)
if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
return;
// The maximum bit width required to represent all the values that can be
// demoted without loss of precision. It would be safe to truncate the roots
// of the expression to this width.
auto MaxBitWidth = 8u;
// We first check if all the bits of the roots are demanded. If they're not,
// we can truncate the roots to this narrower type.
for (auto *Root : TreeRoot) {
auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
MaxBitWidth = std::max<unsigned>(
Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
}
// True if the roots can be zero-extended back to their original type, rather
// than sign-extended. We know that if the leading bits are not demanded, we
// can safely zero-extend. So we initialize IsKnownPositive to True.
bool IsKnownPositive = true;
// If all the bits of the roots are demanded, we can try a little harder to
// compute a narrower type. This can happen, for example, if the roots are
// getelementptr indices. InstCombine promotes these indices to the pointer
// width. Thus, all their bits are technically demanded even though the
// address computation might be vectorized in a smaller type.
//
// We start by looking at each entry that can be demoted. We compute the
// maximum bit width required to store the scalar by using ValueTracking to
// compute the number of high-order bits we can truncate.
if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
MaxBitWidth = 8u;
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
bool KnownZero = false;
bool KnownOne = false;
ComputeSignBit(R, KnownZero, KnownOne, *DL);
return KnownZero;
});
// Determine the maximum number of bits required to store the scalar
// values.
for (auto *Scalar : ToDemote) {
auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
}
// If we can't prove that the sign bit is zero, we must add one to the
// maximum bit width to account for the unknown sign bit. This preserves
// the existing sign bit so we can safely sign-extend the root back to the
// original type. Otherwise, if we know the sign bit is zero, we will
// zero-extend the root instead.
//
// FIXME: This is somewhat suboptimal, as there will be cases where adding
// one to the maximum bit width will yield a larger-than-necessary
// type. In general, we need to add an extra bit only if we can't
// prove that the upper bit of the original type is equal to the
// upper bit of the proposed smaller type. If these two bits are the
// same (either zero or one) we know that sign-extending from the
// smaller type will result in the same value. Here, since we can't
// yet prove this, we are just making the proposed smaller type
// larger to ensure correctness.
if (!IsKnownPositive)
++MaxBitWidth;
}
// Round MaxBitWidth up to the next power-of-two.
if (!isPowerOf2_64(MaxBitWidth))
MaxBitWidth = NextPowerOf2(MaxBitWidth);
// If the maximum bit width we compute is less than the width of the roots'
// type, we can proceed with the narrowing. Otherwise, do nothing.
if (MaxBitWidth >= TreeRootIT->getBitWidth())
return;
// If we can truncate the root, we must collect additional values that might
// be demoted as a result. That is, those seeded by truncations we will
// modify.
while (!Roots.empty())
collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
// Finally, map the values we can demote to the maximum bit width we computed.
for (auto *Scalar : ToDemote)
MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
}
namespace {
/// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass {
SLPVectorizerPass Impl;
/// Pass identification, replacement for typeid
static char ID;
explicit SLPVectorizer() : FunctionPass(ID) {
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
}
bool doInitialization(Module &M) override {
return false;
}
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
FunctionPass::getAnalysisUsage(AU);
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.setPreservesCFG();
}
};
} // end anonymous namespace
PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
auto *LI = &AM.getResult<LoopAnalysis>(F);
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<AAManager>();
PA.preserve<GlobalsAA>();
return PA;
}
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
LoopInfo *LI_, DominatorTree *DT_,
AssumptionCache *AC_, DemandedBits *DB_) {
SE = SE_;
TTI = TTI_;
TLI = TLI_;
AA = AA_;
LI = LI_;
DT = DT_;
AC = AC_;
DB = DB_;
DL = &F.getParent()->getDataLayout();
Stores.clear();
GEPs.clear();
bool Changed = false;
// If the target claims to have no vector registers, don't attempt
// vectorization.
if (!TTI->getNumberOfRegisters(true))
return false;
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
// Use the bottom-up SLP vectorizer to construct chains that start with
// store instructions.
BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
// Scan the blocks in the function in post order.
for (auto BB : post_order(&F.getEntryBlock())) {
collectSeedInstructions(BB);
// Vectorize trees that end at stores.
if (!Stores.empty()) {
DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
<< " underlying objects.\n");
Changed |= vectorizeStoreChains(R);
}
// Vectorize trees that end at reductions.
Changed |= vectorizeChainsInBlock(BB, R);
// Vectorize the index computations of getelementptr instructions. This
// is primarily intended to catch gather-like idioms ending at
// non-consecutive loads.
if (!GEPs.empty()) {
DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
<< " underlying objects.\n");
Changed |= vectorizeGEPIndices(BB, R);
}
}
if (Changed) {
R.optimizeGatherSequence();
DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
DEBUG(verifyFunction(F));
}
return Changed;
}
/// \brief Check that the Values in the slice in VL array are still existent in
/// the WeakVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakVH array.
static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
unsigned SliceBegin, unsigned SliceSize) {
VL = VL.slice(SliceBegin, SliceSize);
VH = VH.slice(SliceBegin, SliceSize);
return !std::equal(VL.begin(), VL.end(), VH.begin());
}
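// Try to vectorize a chain of consecutive stores, using a vector register of
// VecRegSize bits and therefore VF = VecRegSize / Sz lanes per attempt.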
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned VecRegSize) {
unsigned ChainLen = Chain.size();
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");
unsigned Sz = R.getVectorElementSize(Chain[0]);
unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
// Keep track of values that were deleted by vectorizing in the loop below.
SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
bool Changed = false;
// Look for profitable vectorizable trees at all offsets, starting at zero.
for (unsigned i = 0, e = ChainLen; i < e; ++i) {
if (i + VF > e)
break;
// Check that a previous iteration of this loop did not delete the Value.
if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
continue;
DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
<< "\n");
ArrayRef<Value *> Operands = Chain.slice(i, VF);
R.buildTree(Operands);
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
R.vectorizeTree();
// Move to the next bundle.
i += VF - 1;
Changed = true;
}
}
return Changed;
}
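// Pair up consecutive stores into chains and try to vectorize each chain,
// retrying with successively smaller vector register sizes.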
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
BoUpSLP &R) {
SetVector<StoreInst *> Heads, Tails;
SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
BoUpSLP::ValueSet VectorizedStores;
bool Changed = false;
// Do a quadratic search on all of the given stores and find
// all of the pairs of stores that follow each other.
SmallVector<unsigned, 16> IndexQueue;
for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
IndexQueue.clear();
// If a store has multiple consecutive store candidates, search the Stores
// array according to the sequence: from i+1 to e, then from i-1 to 0.
// This is because pairing with the immediately succeeding or preceding
// candidate usually creates the best chance to find an SLP vectorization
// opportunity.
unsigned j = 0;
for (j = i + 1; j < e; ++j)
IndexQueue.push_back(j);
for (j = i; j > 0; --j)
IndexQueue.push_back(j - 1);
for (auto &k : IndexQueue) {
if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
Tails.insert(Stores[k]);
Heads.insert(Stores[i]);
ConsecutiveChain[Stores[i]] = Stores[k];
break;
}
}
}
// For stores that start but don't end a link in the chain:
for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
it != e; ++it) {
if (Tails.count(*it))
continue;
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
StoreInst *I = *it;
// Collect the chain into a list.
while (Tails.count(I) || Heads.count(I)) {
if (VectorizedStores.count(I))
break;
Operands.push_back(I);
// Move to the next value in the chain.
I = ConsecutiveChain[I];
}
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
Size /= 2) {
if (vectorizeStoreChain(Operands, R, Size)) {
// Mark the vectorized stores so that we don't vectorize them again.
VectorizedStores.insert(Operands.begin(), Operands.end());
Changed = true;
break;
}
}
}
return Changed;
}
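// Scan BB once and bucket candidate store and GEP seed instructions by the
// underlying object of their pointer operand.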
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
// Initialize the collections. We will make a single pass over the block.
Stores.clear();
GEPs.clear();
// Visit the store and getelementptr instructions in BB and organize them in
// Stores and GEPs according to the underlying objects of their pointer
// operands.
for (Instruction &I : *BB) {
// Ignore store instructions that are volatile or have a pointer operand
// that doesn't point to a scalar type.
if (auto *SI = dyn_cast<StoreInst>(&I)) {
if (!SI->isSimple())
continue;
if (!isValidElementType(SI->getValueOperand()->getType()))
continue;
Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
}
// Ignore getelementptr instructions that have more than one index, a
// constant index, or a pointer operand that doesn't point to a scalar
// type.
else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
auto Idx = GEP->idx_begin()->get();
if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
continue;
if (!isValidElementType(Idx->getType()))
continue;
if (GEP->getType()->isVectorTy())
continue;
GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
}
}
}
bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = { A, B };
return tryToVectorizeList(VL, R, None, true);
}
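// Try to vectorize the list VL of same-opcode scalar instructions, trying
// vectorization factors from the largest power of two that fits down to MinVF.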
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
ArrayRef<Value *> BuildVector,
bool AllowReorder) {
if (VL.size() < 2)
return false;
DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
<< ".\n");
// Check that all of the parts are scalar instructions of the same type.
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
return false;
unsigned Opcode0 = I0->getOpcode();
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
if (MaxVF < 2)
return false;
for (Value *V : VL) {
Type *Ty = V->getType();
if (!isValidElementType(Ty))
return false;
Instruction *Inst = dyn_cast<Instruction>(V);
if (!Inst || Inst->getOpcode() != Opcode0)
return false;
}
bool Changed = false;
// Keep track of values that were deleted by vectorizing in the loop below.
SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
unsigned NextInst = 0, MaxInst = VL.size();
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
VF /= 2) {
// No actual vectorization should happen if the number of parts is the same
// as the provided vectorization factor (i.e. the scalar type is used for
// vector code during codegen).
auto *VecTy = VectorType::get(VL[0]->getType(), VF);
if (TTI->getNumberOfParts(VecTy) == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
unsigned OpsWidth = 0;
if (I + VF > MaxInst)
OpsWidth = MaxInst - I;
else
OpsWidth = VF;
if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
break;
// Check that a previous iteration of this loop did not delete the Value.
if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
continue;
DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
<< "\n");
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
ArrayRef<Value *> BuildVectorSlice;
if (!BuildVector.empty())
BuildVectorSlice = BuildVector.slice(I, OpsWidth);
R.buildTree(Ops, BuildVectorSlice);
// TODO: check if we can allow reordering for more cases.
if (AllowReorder && R.shouldReorder()) {
// Conceptually, there is nothing actually preventing us from trying to
// reorder a larger list. In fact, we do exactly this when vectorizing
// reductions. However, at this point, we only expect to get here from
// tryToVectorizePair().
assert(Ops.size() == 2);
assert(BuildVectorSlice.empty());
Value *ReorderedOps[] = {Ops[1], Ops[0]};
R.buildTree(ReorderedOps, None);
}
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
R.computeMinimumValueSizes();
int Cost = R.getTreeCost();
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
Value *VectorizedRoot = R.vectorizeTree();
// Reconstruct the build vector by extracting the vectorized root. This
// way we handle the case where some elements of the vector are
// undefined.
// (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
if (!BuildVectorSlice.empty()) {
// The insert point is the last build vector instruction. The
// vectorized root will precede it. This guarantees that we get an
// instruction. The vectorized tree could have been constant folded.
Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
unsigned VecIdx = 0;
for (auto &V : BuildVectorSlice) {
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
++BasicBlock::iterator(InsertAfter));
Instruction *I = cast<Instruction>(V);
assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
Instruction *Extract =
cast<Instruction>(Builder.CreateExtractElement(
VectorizedRoot, Builder.getInt32(VecIdx++)));
I->setOperand(1, Extract);
I->removeFromParent();
I->insertAfter(Extract);
InsertAfter = I;
}
}
// Move to the next bundle.
I += VF - 1;
NextInst = I + 1;
Changed = true;
}
}
}
return Changed;
}
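// As an illustrative walk through the loop above (assuming eight i32 inputs
// and a 128-bit minimum vector register size, so Sz = 32, MinVF = 4 and
// MaxVF = PowerOf2Floor(8) = 8): the whole list is first offered as one
// 8-wide bundle; if that is not profitable, the factor is halved and 4-wide
// windows are tried, sliding by one element after a failure and jumping past
// the bundle after a success, until the factor would drop below MinVF.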
bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
if (!V)
return false;
- Value *P = V->getParent();
-
- // Vectorize in current basic block only.
- auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
- if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
- return false;
-
// Try to vectorize V.
- if (tryToVectorizePair(Op0, Op1, R))
+ if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
return true;
- auto *A = dyn_cast<BinaryOperator>(Op0);
- auto *B = dyn_cast<BinaryOperator>(Op1);
+ BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
+ BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
// Try to skip B.
if (B && B->hasOneUse()) {
- auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
+ BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (tryToVectorizePair(A, B0, R)) {
return true;
- if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
+ }
+ if (tryToVectorizePair(A, B1, R)) {
return true;
+ }
}
// Try to skip A.
if (A && A->hasOneUse()) {
- auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
+ BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (tryToVectorizePair(A0, B, R)) {
return true;
- if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
+ }
+ if (tryToVectorizePair(A1, B, R)) {
return true;
+ }
}
- return false;
+ return 0;
}
/// \brief Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
/// vector.
/// \param IsPairwise Whether the reduction is a pairwise or splitting
/// reduction. A pairwise reduction will generate a mask of
/// <0,2,...> or <1,3,..> while a splitting reduction will generate
/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
bool IsPairwise, bool IsLeft,
IRBuilder<> &Builder) {
assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
SmallVector<Constant *, 32> ShuffleMask(
VecLen, UndefValue::get(Builder.getInt32Ty()));
if (IsPairwise)
// Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
for (unsigned i = 0; i != NumEltsToRdx; ++i)
ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
else
// Move the upper half of the vector to the lower half.
for (unsigned i = 0; i != NumEltsToRdx; ++i)
ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
return ConstantVector::get(ShuffleMask);
}
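// For example, with VecLen = 4 and NumEltsToRdx = 2 the masks produced above
// are <0, 2, undef, undef> (pairwise, left), <1, 3, undef, undef> (pairwise,
// right) and <2, 3, undef, undef> (splitting), matching the description in
// the documentation comment.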
namespace {
/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction operations (currently add and
/// fadd) that has operations that can be put into a vector as its leaf.
/// For example, this tree:
///
/// mul mul mul mul
/// \ / \ /
/// + +
/// \ /
/// +
/// This tree has "mul" as its reduced values and "+" as its reduction
/// operations. A reduction might be feeding into a store or a binary operation
/// feeding a phi.
/// ...
/// \ /
/// +
/// |
/// phi +=
///
/// Or:
/// ...
/// \ /
/// +
/// |
/// *p =
///
class HorizontalReduction {
SmallVector<Value *, 16> ReductionOps;
SmallVector<Value *, 32> ReducedVals;
BinaryOperator *ReductionRoot;
// After a successful horizontal reduction vectorization attempt for a PHI
// node, the vectorizer tries to update the root binary op by combining the
// vectorized tree and the ReductionPHI node. But during vectorization this
// ReductionPHI can itself be vectorized and replaced by an undef value,
// while the instruction itself is marked for deletion. This 'marked for
// deletion' PHI node can then be used in the new binary operation, causing a
// "Use still stuck around after Def is destroyed" crash upon PHI node
// deletion.
WeakVH ReductionPHI;
/// The opcode of the reduction.
unsigned ReductionOpcode;
/// The opcode of the values we perform a reduction on.
unsigned ReducedValueOpcode;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
bool IsPairwiseReduction;
public:
/// The width of one full horizontal reduction operation.
unsigned ReduxWidth;
/// Minimal width of available vector registers. It's used to determine
/// ReduxWidth.
unsigned MinVecRegSize;
HorizontalReduction(unsigned MinVecRegSize)
: ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
IsPairwiseReduction(false), ReduxWidth(0),
MinVecRegSize(MinVecRegSize) {}
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
"Thi phi needs to use the binary operator");
// We could have a initial reductions that is not an add.
// r *= v1 + v2 + v3 + v4
// In such a case start looking for a tree rooted in the first '+'.
if (Phi) {
if (B->getOperand(0) == Phi) {
Phi = nullptr;
B = dyn_cast<BinaryOperator>(B->getOperand(1));
} else if (B->getOperand(1) == Phi) {
Phi = nullptr;
B = dyn_cast<BinaryOperator>(B->getOperand(0));
}
}
if (!B)
return false;
Type *Ty = B->getType();
if (!isValidElementType(Ty))
return false;
const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
// FIXME: Register size should be a parameter to this function, so we can
// try different vectorization factors.
ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
ReductionPHI = Phi;
if (ReduxWidth < 4)
return false;
// We currently only support adds.
if (ReductionOpcode != Instruction::Add &&
ReductionOpcode != Instruction::FAdd)
return false;
// Post order traverse the reduction tree starting at B. We only handle true
// trees containing only binary operators or selects.
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
Stack.push_back(std::make_pair(B, 0));
while (!Stack.empty()) {
Instruction *TreeN = Stack.back().first;
unsigned EdgeToVist = Stack.back().second++;
bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
// Only handle trees in the current basic block.
if (TreeN->getParent() != B->getParent())
return false;
// Each tree node needs to have one user except for the ultimate
// reduction.
if (!TreeN->hasOneUse() && TreeN != B)
return false;
// Postorder visit.
if (EdgeToVist == 2 || IsReducedValue) {
if (IsReducedValue) {
// Make sure that the opcodes of the operations that we are going to
// reduce match.
if (!ReducedValueOpcode)
ReducedValueOpcode = TreeN->getOpcode();
else if (ReducedValueOpcode != TreeN->getOpcode())
return false;
ReducedVals.push_back(TreeN);
} else {
// We need to be able to reassociate the adds.
if (!TreeN->isAssociative())
return false;
ReductionOps.push_back(TreeN);
}
// Retract.
Stack.pop_back();
continue;
}
// Visit left or right.
Value *NextV = TreeN->getOperand(EdgeToVist);
if (NextV != Phi) {
auto *I = dyn_cast<Instruction>(NextV);
// Continue analysis if the next operand is a reduction operation or
// (possibly) a reduced value. If the reduced value opcode is not set,
// the first operation encountered that differs from the reduction operation
// is taken as the reduced value class.
if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
I->getOpcode() == ReductionOpcode)) {
if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
ReducedValueOpcode = I->getOpcode();
Stack.push_back(std::make_pair(I, 0));
continue;
}
return false;
}
}
return true;
}
/// \brief Attempt to vectorize the tree found by
/// matchAssociativeReduction.
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
if (ReducedVals.empty())
return false;
unsigned NumReducedVals = ReducedVals.size();
if (NumReducedVals < ReduxWidth)
return false;
Value *VectorizedTree = nullptr;
IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;
Unsafe.setUnsafeAlgebra();
Builder.setFastMathFlags(Unsafe);
unsigned i = 0;
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ReductionOps);
if (V.shouldReorder()) {
SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
V.buildTree(Reversed, ReductionOps);
}
if (V.isTreeTinyAndNotFullyVectorizable())
continue;
V.computeMinimumValueSizes();
// Estimate cost.
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
if (Cost >= -SLPCostThreshold)
break;
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
Value *VectorizedRoot = V.vectorizeTree();
// Emit a reduction.
Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
ReducedSubTree, "bin.rdx");
} else
VectorizedTree = ReducedSubTree;
}
if (VectorizedTree) {
// Finish the reduction.
for (; i < NumReducedVals; ++i) {
Builder.SetCurrentDebugLocation(
cast<Instruction>(ReducedVals[i])->getDebugLoc());
VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
ReducedVals[i]);
}
// Update users.
if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
assert(ReductionRoot && "Need a reduction operation");
ReductionRoot->setOperand(0, VectorizedTree);
ReductionRoot->setOperand(1, ReductionPHI);
} else
ReductionRoot->replaceAllUsesWith(VectorizedTree);
}
return VectorizedTree != nullptr;
}
unsigned numReductionValues() const {
return ReducedVals.size();
}
private:
/// \brief Calculate the cost of a reduction.
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
Type *ScalarTy = FirstReducedVal->getType();
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
int ScalarReduxCost =
(ReduxWidth - 1) *
TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
<< " for reduction that starts with " << *FirstReducedVal
<< " (It is a "
<< (IsPairwiseReduction ? "pairwise" : "splitting")
<< " reduction)\n");
return VecReduxCost - ScalarReduxCost;
}
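// As a worked example of the cost model above (the per-operation costs are
// target dependent; the numbers here are only illustrative): with
// ReduxWidth = 8 and a scalar add cost of 1, ScalarReduxCost is
// (8 - 1) * 1 = 7, so the returned delta is negative whenever the cheaper of
// the pairwise and splitting estimates comes in below 7.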
static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
Value *R, const Twine &Name = "") {
if (Opcode == Instruction::FAdd)
return Builder.CreateFAdd(L, R, Name);
return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
}
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
if (IsPairwiseReduction) {
Value *LeftMask =
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
Value *RightMask =
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
"bin.rdx");
} else {
Value *UpperHalf =
createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
Value *Shuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
}
}
// The result is in the first element of the vector.
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
}
};
} // end anonymous namespace
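// A sketch of the IR emitReduction above produces for a splitting reduction
// of a <4 x i32> value %v (value names are illustrative; actual names get
// numeric suffixes):
//   %rdx.shuf  = shufflevector <4 x i32> %v, <4 x i32> undef,
//                              <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
//   %bin.rdx   = add <4 x i32> %v, %rdx.shuf
//   %rdx.shuf1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef,
//                              <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
//   %bin.rdx1  = add <4 x i32> %bin.rdx, %rdx.shuf1
//   %r         = extractelement <4 x i32> %bin.rdx1, i32 0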
/// \brief Recognize construction of vectors like
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
///
/// Returns true if it matches
///
static bool findBuildVector(InsertElementInst *FirstInsertElem,
SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
return false;
InsertElementInst *IE = FirstInsertElem;
while (true) {
BuildVector.push_back(IE);
BuildVectorOpds.push_back(IE->getOperand(1));
if (IE->use_empty())
return false;
InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
if (!NextUse)
return true;
// If this isn't the final use, make sure the next insertelement is the only
// use. It's OK if the final constructed vector is used multiple times
if (!IE->hasOneUse())
return false;
IE = NextUse;
}
return false;
}
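// For the insertelement chain shown in the documentation comment above, a
// successful match fills BuildVector with {%ra, %rb, %rc, %rd} and
// BuildVectorOpds with the inserted scalars {%s0, %s1, %s2, %s3}.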
/// \brief Like findBuildVector, but looks backwards for the construction of an aggregate.
///
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
if (!IV->hasOneUse())
return false;
Value *V = IV->getAggregateOperand();
if (!isa<UndefValue>(V)) {
InsertValueInst *I = dyn_cast<InsertValueInst>(V);
if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
return false;
}
BuildVector.push_back(IV);
BuildVectorOpds.push_back(IV->getInsertedValueOperand());
return true;
}
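// A minimal example of the pattern findBuildAggregate recognizes (names are
// illustrative, and %agg1 is assumed to have a single user such as the store
// this is called from):
//   %agg0 = insertvalue { float, float } undef, float %s0, 0
//   %agg1 = insertvalue { float, float } %agg0, float %s1, 1
// Walking backwards from %agg1 yields BuildVector = {%agg0, %agg1} and
// BuildVectorOpds = {%s0, %s1}.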
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
return V->getType() < V2->getType();
}
/// \brief Try and get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
BasicBlock *ParentBB, LoopInfo *LI) {
// There are situations where the reduction value is not dominated by the
// reduction phi. Vectorizing such cases has been reported to cause
// miscompiles. See PR25787.
auto DominatedReduxValue = [&](Value *R) {
return (
dyn_cast<Instruction>(R) &&
DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
};
Value *Rdx = nullptr;
// Return the incoming value if it comes from the same BB as the phi node.
if (P->getIncomingBlock(0) == ParentBB) {
Rdx = P->getIncomingValue(0);
} else if (P->getIncomingBlock(1) == ParentBB) {
Rdx = P->getIncomingValue(1);
}
if (Rdx && DominatedReduxValue(Rdx))
return Rdx;
// Otherwise, check whether we have a loop latch to look at.
Loop *BBL = LI->getLoopFor(ParentBB);
if (!BBL)
return nullptr;
BasicBlock *BBLatch = BBL->getLoopLatch();
if (!BBLatch)
return nullptr;
// There is a loop latch, return the incoming value if it comes from
// that. This reduction pattern occasionally turns up.
if (P->getIncomingBlock(0) == BBLatch) {
Rdx = P->getIncomingValue(0);
} else if (P->getIncomingBlock(1) == BBLatch) {
Rdx = P->getIncomingValue(1);
}
if (Rdx && DominatedReduxValue(Rdx))
return Rdx;
return nullptr;
}
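// A typical shape this matches (illustrative):
//   loop:
//     %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
//     ...
//     %sum.next = add i32 %sum, %x
//     br i1 %cond, label %loop, label %exit
// With ParentBB = %loop, the incoming value %sum.next comes from the phi's
// own block (which is also the loop latch) and that block trivially
// dominates itself, so %sum.next is returned as the candidate reduction
// value.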
-namespace {
-/// Tracks instructons and its children.
-class WeakVHWithLevel final : public CallbackVH {
- /// Operand index of the instruction currently beeing analized.
- unsigned Level = 0;
- /// Is this the instruction that should be vectorized, or are we now
- /// processing children (i.e. operands of this instruction) for potential
- /// vectorization?
- bool IsInitial = true;
-
-public:
- explicit WeakVHWithLevel() = default;
- WeakVHWithLevel(Value *V) : CallbackVH(V){};
- /// Restart children analysis each time it is repaced by the new instruction.
- void allUsesReplacedWith(Value *New) override {
- setValPtr(New);
- Level = 0;
- IsInitial = true;
- }
- /// Check if the instruction was not deleted during vectorization.
- bool isValid() const { return !getValPtr(); }
- /// Is the istruction itself must be vectorized?
- bool isInitial() const { return IsInitial; }
- /// Try to vectorize children.
- void clearInitial() { IsInitial = false; }
- /// Are all children processed already?
- bool isFinal() const {
- assert(getValPtr() &&
- (isa<Instruction>(getValPtr()) &&
- cast<Instruction>(getValPtr())->getNumOperands() >= Level));
- return getValPtr() &&
- cast<Instruction>(getValPtr())->getNumOperands() == Level;
- }
- /// Get next child operation.
- Value *nextOperand() {
- assert(getValPtr() && isa<Instruction>(getValPtr()) &&
- cast<Instruction>(getValPtr())->getNumOperands() > Level);
- return cast<Instruction>(getValPtr())->getOperand(Level++);
- }
- virtual ~WeakVHWithLevel() = default;
-};
-} // namespace
-
/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding
-/// the phi node P with reduction operators Root in a basic block BB, then check
-/// if it can be done.
+/// the phi node P with reduction operators BI, then check if it
+/// can be done.
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
-static bool canBeVectorized(
- PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI,
- const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
+static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
+ BoUpSLP &R, TargetTransformInfo *TTI,
+ unsigned MinRegSize) {
if (!ShouldVectorizeHor)
return false;
- if (!Root)
+ HorizontalReduction HorRdx(MinRegSize);
+ if (!HorRdx.matchAssociativeReduction(P, BI))
return false;
- if (Root->getParent() != BB)
- return false;
- SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
- SmallSet<Value *, 8> VisitedInstrs;
- bool Res = false;
- while (!Stack.empty()) {
- Value *V = Stack.back();
- if (!V) {
- Stack.pop_back();
- continue;
- }
- auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst || isa<PHINode>(Inst)) {
- Stack.pop_back();
- continue;
- }
- if (Stack.back().isInitial()) {
- Stack.back().clearInitial();
- if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
- HorizontalReduction HorRdx(R.getMinVecRegSize());
- if (HorRdx.matchAssociativeReduction(P, BI)) {
- // If there is a sufficient number of reduction values, reduce
- // to a nearby power-of-2. Can safely generate oversized
- // vectors and rely on the backend to split them to legal sizes.
- HorRdx.ReduxWidth =
- std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+ // If there is a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. Can safely generate oversized
+ // vectors and rely on the backend to split them to legal sizes.
+ HorRdx.ReduxWidth =
+ std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
- if (HorRdx.tryToReduce(R, TTI)) {
- Res = true;
- P = nullptr;
- continue;
- }
- }
- if (P) {
- Inst = dyn_cast<Instruction>(BI->getOperand(0));
- if (Inst == P)
- Inst = dyn_cast<Instruction>(BI->getOperand(1));
- if (!Inst) {
- P = nullptr;
- continue;
- }
- }
- }
- P = nullptr;
- if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
- Res = true;
- continue;
- }
- }
- if (Stack.back().isFinal()) {
- Stack.pop_back();
- continue;
- }
-
- if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
- if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
- Stack.size() < RecursionMaxDepth)
- Stack.push_back(NextV);
- }
- return Res;
+ return HorRdx.tryToReduce(R, TTI);
}
-bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
- BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI) {
- if (!V)
- return false;
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
-
- if (!isa<BinaryOperator>(I))
- P = nullptr;
- // Try to match and vectorize a horizontal reduction.
- return canBeVectorized(P, I, BB, R, TTI,
- [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
- return tryToVectorize(BI, R);
- });
-}
-
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallSet<Value *, 16> VisitedInstrs;
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
HaveVectorizedPhiNodes = false;
// Collect the incoming values from the PHIs.
Incoming.clear();
for (Instruction &I : *BB) {
PHINode *P = dyn_cast<PHINode>(&I);
if (!P)
break;
if (!VisitedInstrs.count(P))
Incoming.push_back(P);
}
// Sort by type.
std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
// Try to vectorize elements based on their type.
for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
E = Incoming.end();
IncIt != E;) {
// Look for the next elements with the same type.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
while (SameTypeIt != E &&
(*SameTypeIt)->getType() == (*IncIt)->getType()) {
VisitedInstrs.insert(*SameTypeIt);
++SameTypeIt;
}
// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);
DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
// Success: start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
break;
}
// Start over at the next instruction of a different type (or the end).
IncIt = SameTypeIt;
}
}
VisitedInstrs.clear();
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
// We may go through BB multiple times, so skip instructions we have already checked.
if (!VisitedInstrs.insert(&*it).second)
continue;
if (isa<DbgInfoIntrinsic>(it))
continue;
// Try to vectorize reductions that use PHINodes.
if (PHINode *P = dyn_cast<PHINode>(it)) {
// Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() != 2)
return Changed;
+ Value *Rdx = getReductionValue(DT, P, BB, LI);
+
+ // Check if this is a Binary Operator.
+ BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
+ if (!BI)
+ continue;
+
// Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
- TTI)) {
+ if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
Changed = true;
it = BB->begin();
e = BB->end();
continue;
}
+
+ Value *Inst = BI->getOperand(0);
+ if (Inst == P)
+ Inst = BI->getOperand(1);
+
+ if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become an invalid value.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+
continue;
}
- if (ShouldStartVectorizeHorAtStore) {
- if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
- // Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
- TTI)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
+ if (ShouldStartVectorizeHorAtStore)
+ if (StoreInst *SI = dyn_cast<StoreInst>(it))
+ if (BinaryOperator *BinOp =
+ dyn_cast<BinaryOperator>(SI->getValueOperand())) {
+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
+ R.getMinVecRegSize()) ||
+ tryToVectorize(BinOp, R)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
}
- }
- }
// Try to vectorize horizontal reductions feeding into a return.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
- if (RI->getNumOperands() != 0) {
- // Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
+ if (RI->getNumOperands() != 0)
+ if (BinaryOperator *BinOp =
+ dyn_cast<BinaryOperator>(RI->getOperand(0))) {
+ DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
+ R.getMinVecRegSize()) ||
+ tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
+ R)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
}
- }
- }
// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
Changed = true;
// We would like to start over since some instructions are deleted
// and the iterator may become an invalid value.
it = BB->begin();
e = BB->end();
continue;
}
- for (int I = 0; I < 2; ++I) {
- if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
- Changed = true;
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- it = BB->begin();
- e = BB->end();
- break;
+ for (int i = 0; i < 2; ++i) {
+ if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
+ if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become an invalid value.
+ it = BB->begin();
+ e = BB->end();
+ break;
+ }
}
}
continue;
}
// Try to vectorize trees that start at insertelement instructions.
if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
SmallVector<Value *, 16> BuildVector;
SmallVector<Value *, 16> BuildVectorOpds;
if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
continue;
// Vectorize starting with the build vector operands ignoring the
// BuildVector instructions for the purpose of scheduling and user
// extraction.
if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
Changed = true;
it = BB->begin();
e = BB->end();
}
continue;
}
// Try to vectorize trees that start at insertvalue instructions feeding into
// a store.
if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
const DataLayout &DL = BB->getModule()->getDataLayout();
if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
SmallVector<Value *, 16> BuildVector;
SmallVector<Value *, 16> BuildVectorOpds;
if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
continue;
DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
Changed = true;
it = BB->begin();
e = BB->end();
}
continue;
}
}
}
}
return Changed;
}
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
auto Changed = false;
for (auto &Entry : GEPs) {
// If the getelementptr list has fewer than two elements, there's nothing
// to do.
if (Entry.second.size() < 2)
continue;
DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
<< Entry.second.size() << ".\n");
// We process the getelementptr list in chunks of 16 (like we do for
// stores) to minimize compile-time.
for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
auto Len = std::min<unsigned>(BE - BI, 16);
auto GEPList = makeArrayRef(&Entry.second[BI], Len);
// Initialize a set of candidate getelementptrs. Note that we use a
// SetVector here to preserve program order. If the index computations
// are vectorizable and begin with loads, we want to minimize the chance
// of having to reorder them later.
SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
// Some of the candidates may have already been vectorized after we
// initially collected them. If so, the WeakVHs will have nullified the
// values, so remove them from the set of candidates.
Candidates.remove(nullptr);
// Remove from the set of candidates all pairs of getelementptrs with
// constant differences. Such getelementptrs are likely not good
// candidates for vectorization in a bottom-up phase since one can be
// computed from the other. We also ensure all candidate getelementptr
// indices are unique.
for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
if (!Candidates.count(GEPI))
continue;
auto *SCEVI = SE->getSCEV(GEPList[I]);
for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
auto *SCEVJ = SE->getSCEV(GEPList[J]);
if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
Candidates.remove(GEPList[I]);
Candidates.remove(GEPList[J]);
} else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
Candidates.remove(GEPList[J]);
}
}
}
// We break out of the above computation as soon as we know there are
// fewer than two candidates remaining.
if (Candidates.size() < 2)
continue;
// Add the single, non-constant index of each candidate to the bundle. We
// ensured the indices met these constraints when we originally collected
// the getelementptrs.
SmallVector<Value *, 16> Bundle(Candidates.size());
auto BundleIndex = 0u;
for (auto *V : Candidates) {
auto *GEP = cast<GetElementPtrInst>(V);
auto *GEPIdx = GEP->idx_begin()->get();
assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
Bundle[BundleIndex++] = GEPIdx;
}
// Try and vectorize the indices. We are currently only interested in
// gather-like cases of the form:
//
// ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
//
// where the loads of "a", the loads of "b", and the subtractions can be
// performed in parallel. It's likely that detecting this pattern in a
// bottom-up phase will be simpler and less costly than building a
// full-blown top-down phase beginning at the consecutive loads.
Changed |= tryToVectorizeList(Bundle, R);
}
}
return Changed;
}
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
bool Changed = false;
// Attempt to sort and vectorize each of the store-groups.
for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
++it) {
if (it->second.size() < 2)
continue;
DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
<< it->second.size() << ".\n");
// Process the stores in chunks of 16.
// TODO: The limit of 16 inhibits greater vectorization factors.
// For example, AVX2 supports v32i8. Increasing this limit, however,
// may cause a significant compile-time increase.
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
}
}
return Changed;
}
char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
namespace llvm {
Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
}
Index: projects/clang400-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp (revision 314269)
@@ -1,6795 +1,6797 @@
//===----- CGOpenMPRuntime.cpp - Interface to OpenMP Runtimes -------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation.
//
//===----------------------------------------------------------------------===//
#include "CGCXXABI.h"
#include "CGCleanup.h"
#include "CGOpenMPRuntime.h"
#include "CodeGenFunction.h"
#include "ConstantBuilder.h"
#include "clang/AST/Decl.h"
#include "clang/AST/StmtOpenMP.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
using namespace clang;
using namespace CodeGen;
namespace {
/// \brief Base class for handling code generation inside OpenMP regions.
class CGOpenMPRegionInfo : public CodeGenFunction::CGCapturedStmtInfo {
public:
/// \brief Kinds of OpenMP regions used in codegen.
enum CGOpenMPRegionKind {
/// \brief Region with outlined function for standalone 'parallel'
/// directive.
ParallelOutlinedRegion,
/// \brief Region with outlined function for standalone 'task' directive.
TaskOutlinedRegion,
/// \brief Region for constructs that do not require function outlining,
/// like 'for', 'sections', 'atomic' etc. directives.
InlinedRegion,
/// \brief Region with outlined function for standalone 'target' directive.
TargetRegion,
};
CGOpenMPRegionInfo(const CapturedStmt &CS,
const CGOpenMPRegionKind RegionKind,
const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind,
bool HasCancel)
: CGCapturedStmtInfo(CS, CR_OpenMP), RegionKind(RegionKind),
CodeGen(CodeGen), Kind(Kind), HasCancel(HasCancel) {}
CGOpenMPRegionInfo(const CGOpenMPRegionKind RegionKind,
const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind,
bool HasCancel)
: CGCapturedStmtInfo(CR_OpenMP), RegionKind(RegionKind), CodeGen(CodeGen),
Kind(Kind), HasCancel(HasCancel) {}
/// \brief Get a variable or parameter for storing global thread id
/// inside OpenMP construct.
virtual const VarDecl *getThreadIDVariable() const = 0;
/// \brief Emit the captured statement body.
void EmitBody(CodeGenFunction &CGF, const Stmt *S) override;
/// \brief Get an LValue for the current ThreadID variable.
/// \return LValue for thread id variable. This LValue always has type int32*.
virtual LValue getThreadIDVariableLValue(CodeGenFunction &CGF);
virtual void emitUntiedSwitch(CodeGenFunction & /*CGF*/) {}
CGOpenMPRegionKind getRegionKind() const { return RegionKind; }
OpenMPDirectiveKind getDirectiveKind() const { return Kind; }
bool hasCancel() const { return HasCancel; }
static bool classof(const CGCapturedStmtInfo *Info) {
return Info->getKind() == CR_OpenMP;
}
~CGOpenMPRegionInfo() override = default;
protected:
CGOpenMPRegionKind RegionKind;
RegionCodeGenTy CodeGen;
OpenMPDirectiveKind Kind;
bool HasCancel;
};
/// \brief API for captured statement code generation in OpenMP constructs.
class CGOpenMPOutlinedRegionInfo final : public CGOpenMPRegionInfo {
public:
CGOpenMPOutlinedRegionInfo(const CapturedStmt &CS, const VarDecl *ThreadIDVar,
const RegionCodeGenTy &CodeGen,
OpenMPDirectiveKind Kind, bool HasCancel,
StringRef HelperName)
: CGOpenMPRegionInfo(CS, ParallelOutlinedRegion, CodeGen, Kind,
HasCancel),
ThreadIDVar(ThreadIDVar), HelperName(HelperName) {
assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region.");
}
/// \brief Get a variable or parameter for storing global thread id
/// inside OpenMP construct.
const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; }
/// \brief Get the name of the capture helper.
StringRef getHelperName() const override { return HelperName; }
static bool classof(const CGCapturedStmtInfo *Info) {
return CGOpenMPRegionInfo::classof(Info) &&
cast<CGOpenMPRegionInfo>(Info)->getRegionKind() ==
ParallelOutlinedRegion;
}
private:
/// \brief A variable or parameter storing global thread id for OpenMP
/// constructs.
const VarDecl *ThreadIDVar;
StringRef HelperName;
};
/// \brief API for captured statement code generation in OpenMP constructs.
class CGOpenMPTaskOutlinedRegionInfo final : public CGOpenMPRegionInfo {
public:
class UntiedTaskActionTy final : public PrePostActionTy {
bool Untied;
const VarDecl *PartIDVar;
const RegionCodeGenTy UntiedCodeGen;
llvm::SwitchInst *UntiedSwitch = nullptr;
public:
UntiedTaskActionTy(bool Tied, const VarDecl *PartIDVar,
const RegionCodeGenTy &UntiedCodeGen)
: Untied(!Tied), PartIDVar(PartIDVar), UntiedCodeGen(UntiedCodeGen) {}
void Enter(CodeGenFunction &CGF) override {
if (Untied) {
// Emit task switching point.
auto PartIdLVal = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(PartIDVar),
PartIDVar->getType()->castAs<PointerType>());
auto *Res = CGF.EmitLoadOfScalar(PartIdLVal, SourceLocation());
auto *DoneBB = CGF.createBasicBlock(".untied.done.");
UntiedSwitch = CGF.Builder.CreateSwitch(Res, DoneBB);
CGF.EmitBlock(DoneBB);
CGF.EmitBranchThroughCleanup(CGF.ReturnBlock);
CGF.EmitBlock(CGF.createBasicBlock(".untied.jmp."));
UntiedSwitch->addCase(CGF.Builder.getInt32(0),
CGF.Builder.GetInsertBlock());
emitUntiedSwitch(CGF);
}
}
void emitUntiedSwitch(CodeGenFunction &CGF) const {
if (Untied) {
auto PartIdLVal = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(PartIDVar),
PartIDVar->getType()->castAs<PointerType>());
CGF.EmitStoreOfScalar(CGF.Builder.getInt32(UntiedSwitch->getNumCases()),
PartIdLVal);
UntiedCodeGen(CGF);
CodeGenFunction::JumpDest CurPoint =
CGF.getJumpDestInCurrentScope(".untied.next.");
CGF.EmitBranchThroughCleanup(CGF.ReturnBlock);
CGF.EmitBlock(CGF.createBasicBlock(".untied.jmp."));
UntiedSwitch->addCase(CGF.Builder.getInt32(UntiedSwitch->getNumCases()),
CGF.Builder.GetInsertBlock());
CGF.EmitBranchThroughCleanup(CurPoint);
CGF.EmitBlock(CurPoint.getBlock());
}
}
unsigned getNumberOfParts() const { return UntiedSwitch->getNumCases(); }
};
CGOpenMPTaskOutlinedRegionInfo(const CapturedStmt &CS,
const VarDecl *ThreadIDVar,
const RegionCodeGenTy &CodeGen,
OpenMPDirectiveKind Kind, bool HasCancel,
const UntiedTaskActionTy &Action)
: CGOpenMPRegionInfo(CS, TaskOutlinedRegion, CodeGen, Kind, HasCancel),
ThreadIDVar(ThreadIDVar), Action(Action) {
assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region.");
}
/// \brief Get a variable or parameter for storing global thread id
/// inside OpenMP construct.
const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; }
/// \brief Get an LValue for the current ThreadID variable.
LValue getThreadIDVariableLValue(CodeGenFunction &CGF) override;
/// \brief Get the name of the capture helper.
StringRef getHelperName() const override { return ".omp_outlined."; }
void emitUntiedSwitch(CodeGenFunction &CGF) override {
Action.emitUntiedSwitch(CGF);
}
static bool classof(const CGCapturedStmtInfo *Info) {
return CGOpenMPRegionInfo::classof(Info) &&
cast<CGOpenMPRegionInfo>(Info)->getRegionKind() ==
TaskOutlinedRegion;
}
private:
/// \brief A variable or parameter storing global thread id for OpenMP
/// constructs.
const VarDecl *ThreadIDVar;
/// Action for emitting code for untied tasks.
const UntiedTaskActionTy &Action;
};
/// \brief API for inlined captured statement code generation in OpenMP
/// constructs.
class CGOpenMPInlinedRegionInfo : public CGOpenMPRegionInfo {
public:
CGOpenMPInlinedRegionInfo(CodeGenFunction::CGCapturedStmtInfo *OldCSI,
const RegionCodeGenTy &CodeGen,
OpenMPDirectiveKind Kind, bool HasCancel)
: CGOpenMPRegionInfo(InlinedRegion, CodeGen, Kind, HasCancel),
OldCSI(OldCSI),
OuterRegionInfo(dyn_cast_or_null<CGOpenMPRegionInfo>(OldCSI)) {}
// \brief Retrieve the value of the context parameter.
llvm::Value *getContextValue() const override {
if (OuterRegionInfo)
return OuterRegionInfo->getContextValue();
llvm_unreachable("No context value for inlined OpenMP region");
}
void setContextValue(llvm::Value *V) override {
if (OuterRegionInfo) {
OuterRegionInfo->setContextValue(V);
return;
}
llvm_unreachable("No context value for inlined OpenMP region");
}
/// \brief Lookup the captured field decl for a variable.
const FieldDecl *lookup(const VarDecl *VD) const override {
if (OuterRegionInfo)
return OuterRegionInfo->lookup(VD);
// If there is no outer outlined region, there is no need to look up the
// variable in a list of captured variables; we can use the original one.
return nullptr;
}
FieldDecl *getThisFieldDecl() const override {
if (OuterRegionInfo)
return OuterRegionInfo->getThisFieldDecl();
return nullptr;
}
/// \brief Get a variable or parameter for storing global thread id
/// inside OpenMP construct.
const VarDecl *getThreadIDVariable() const override {
if (OuterRegionInfo)
return OuterRegionInfo->getThreadIDVariable();
return nullptr;
}
/// \brief Get the name of the capture helper.
StringRef getHelperName() const override {
if (auto *OuterRegionInfo = getOldCSI())
return OuterRegionInfo->getHelperName();
llvm_unreachable("No helper name for inlined OpenMP construct");
}
void emitUntiedSwitch(CodeGenFunction &CGF) override {
if (OuterRegionInfo)
OuterRegionInfo->emitUntiedSwitch(CGF);
}
CodeGenFunction::CGCapturedStmtInfo *getOldCSI() const { return OldCSI; }
static bool classof(const CGCapturedStmtInfo *Info) {
return CGOpenMPRegionInfo::classof(Info) &&
cast<CGOpenMPRegionInfo>(Info)->getRegionKind() == InlinedRegion;
}
~CGOpenMPInlinedRegionInfo() override = default;
private:
/// \brief CodeGen info about outer OpenMP region.
CodeGenFunction::CGCapturedStmtInfo *OldCSI;
CGOpenMPRegionInfo *OuterRegionInfo;
};
/// \brief API for captured statement code generation in OpenMP target
/// constructs. For these captures, implicit parameters are used instead of
/// the captured fields. The name of the target region has to be unique in a
/// given application, so it is provided by the client, because only the
/// client has the information needed to generate it.
class CGOpenMPTargetRegionInfo final : public CGOpenMPRegionInfo {
public:
CGOpenMPTargetRegionInfo(const CapturedStmt &CS,
const RegionCodeGenTy &CodeGen, StringRef HelperName)
: CGOpenMPRegionInfo(CS, TargetRegion, CodeGen, OMPD_target,
/*HasCancel=*/false),
HelperName(HelperName) {}
/// \brief This is unused for target regions because each starts executing
/// with a single thread.
const VarDecl *getThreadIDVariable() const override { return nullptr; }
/// \brief Get the name of the capture helper.
StringRef getHelperName() const override { return HelperName; }
static bool classof(const CGCapturedStmtInfo *Info) {
return CGOpenMPRegionInfo::classof(Info) &&
cast<CGOpenMPRegionInfo>(Info)->getRegionKind() == TargetRegion;
}
private:
StringRef HelperName;
};
static void EmptyCodeGen(CodeGenFunction &, PrePostActionTy &) {
llvm_unreachable("No codegen for expressions");
}
/// \brief API for generation of expressions captured in an innermost OpenMP
/// region.
class CGOpenMPInnerExprInfo final : public CGOpenMPInlinedRegionInfo {
public:
CGOpenMPInnerExprInfo(CodeGenFunction &CGF, const CapturedStmt &CS)
: CGOpenMPInlinedRegionInfo(CGF.CapturedStmtInfo, EmptyCodeGen,
OMPD_unknown,
/*HasCancel=*/false),
PrivScope(CGF) {
// Make sure the globals captured in the provided statement are local by
// using the privatization logic. We assume the same variable is not
// captured more than once.
for (auto &C : CS.captures()) {
if (!C.capturesVariable() && !C.capturesVariableByCopy())
continue;
const VarDecl *VD = C.getCapturedVar();
if (VD->isLocalVarDeclOrParm())
continue;
DeclRefExpr DRE(const_cast<VarDecl *>(VD),
/*RefersToEnclosingVariableOrCapture=*/false,
VD->getType().getNonReferenceType(), VK_LValue,
SourceLocation());
PrivScope.addPrivate(VD, [&CGF, &DRE]() -> Address {
return CGF.EmitLValue(&DRE).getAddress();
});
}
(void)PrivScope.Privatize();
}
/// \brief Lookup the captured field decl for a variable.
const FieldDecl *lookup(const VarDecl *VD) const override {
if (auto *FD = CGOpenMPInlinedRegionInfo::lookup(VD))
return FD;
return nullptr;
}
/// \brief Emit the captured statement body.
void EmitBody(CodeGenFunction &CGF, const Stmt *S) override {
llvm_unreachable("No body for expressions");
}
/// \brief Get a variable or parameter for storing global thread id
/// inside OpenMP construct.
const VarDecl *getThreadIDVariable() const override {
llvm_unreachable("No thread id for expressions");
}
/// \brief Get the name of the capture helper.
StringRef getHelperName() const override {
llvm_unreachable("No helper name for expressions");
}
static bool classof(const CGCapturedStmtInfo *Info) { return false; }
private:
/// Private scope to capture global variables.
CodeGenFunction::OMPPrivateScope PrivScope;
};
/// \brief RAII for emitting code of OpenMP constructs.
class InlinedOpenMPRegionRAII {
CodeGenFunction &CGF;
llvm::DenseMap<const VarDecl *, FieldDecl *> LambdaCaptureFields;
FieldDecl *LambdaThisCaptureField = nullptr;
public:
/// \brief Constructs region for combined constructs.
/// \param CodeGen Code generation sequence for combined directives. Includes
/// a list of functions used for code generation of implicitly inlined
/// regions.
InlinedOpenMPRegionRAII(CodeGenFunction &CGF, const RegionCodeGenTy &CodeGen,
OpenMPDirectiveKind Kind, bool HasCancel)
: CGF(CGF) {
// Start emission for the construct.
CGF.CapturedStmtInfo = new CGOpenMPInlinedRegionInfo(
CGF.CapturedStmtInfo, CodeGen, Kind, HasCancel);
std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
LambdaThisCaptureField = CGF.LambdaThisCaptureField;
CGF.LambdaThisCaptureField = nullptr;
}
~InlinedOpenMPRegionRAII() {
// Restore original CapturedStmtInfo only if we're done with code emission.
auto *OldCSI =
cast<CGOpenMPInlinedRegionInfo>(CGF.CapturedStmtInfo)->getOldCSI();
delete CGF.CapturedStmtInfo;
CGF.CapturedStmtInfo = OldCSI;
std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
CGF.LambdaThisCaptureField = LambdaThisCaptureField;
}
};
/// \brief Values for bit flags used in the ident_t to describe the fields.
/// All enumerated elements are named and described in accordance with the code
/// from http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
enum OpenMPLocationFlags {
/// \brief Use trampoline for internal microtask.
OMP_IDENT_IMD = 0x01,
/// \brief Use c-style ident structure.
OMP_IDENT_KMPC = 0x02,
/// \brief Atomic reduction option for kmpc_reduce.
OMP_ATOMIC_REDUCE = 0x10,
/// \brief Explicit 'barrier' directive.
OMP_IDENT_BARRIER_EXPL = 0x20,
/// \brief Implicit barrier in code.
OMP_IDENT_BARRIER_IMPL = 0x40,
/// \brief Implicit barrier in 'for' directive.
OMP_IDENT_BARRIER_IMPL_FOR = 0x40,
/// \brief Implicit barrier in 'sections' directive.
OMP_IDENT_BARRIER_IMPL_SECTIONS = 0xC0,
/// \brief Implicit barrier in 'single' directive.
OMP_IDENT_BARRIER_IMPL_SINGLE = 0x140
};
/// \brief Describes ident structure that describes a source location.
/// All descriptions are taken from
/// http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
/// Original structure:
/// typedef struct ident {
/// kmp_int32 reserved_1; /**< might be used in Fortran;
/// see above */
/// kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags;
/// KMP_IDENT_KMPC identifies this union
/// member */
/// kmp_int32 reserved_2; /**< not really used in Fortran any more;
/// see above */
///#if USE_ITT_BUILD
/// /* but currently used for storing
/// region-specific ITT */
/// /* contextual information. */
///#endif /* USE_ITT_BUILD */
/// kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for
/// C++ */
/// char const *psource; /**< String describing the source location.
/// The string is composed of semi-colon separated
/// fields which describe the source file,
/// the function and a pair of line numbers that
/// delimit the construct.
/// */
/// } ident_t;
enum IdentFieldIndex {
/// \brief might be used in Fortran
IdentField_Reserved_1,
/// \brief OMP_IDENT_xxx flags; OMP_IDENT_KMPC identifies this union member.
IdentField_Flags,
/// \brief Not really used in Fortran any more
IdentField_Reserved_2,
/// \brief Source[4] in Fortran, do not use for C++
IdentField_Reserved_3,
/// \brief String describing the source location. The string is composed of
/// semi-colon separated fields which describe the source file, the function
/// and a pair of line numbers that delimit the construct.
IdentField_PSource
};
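// For illustration, the psource string takes the form
// ";<source file>;<function>;<line>;<column>;;", e.g. ";t.c;foo;4;1;;" when
// location information is available and ";unknown;unknown;0;0;;" otherwise
// (the exact contents depend on how the ident_t entry is emitted).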
/// \brief Schedule types for 'omp for' loops (these enumerators are taken from
/// the enum sched_type in kmp.h).
enum OpenMPSchedType {
/// \brief Lower bound for default (unordered) versions.
OMP_sch_lower = 32,
OMP_sch_static_chunked = 33,
OMP_sch_static = 34,
OMP_sch_dynamic_chunked = 35,
OMP_sch_guided_chunked = 36,
OMP_sch_runtime = 37,
OMP_sch_auto = 38,
/// static with chunk adjustment (e.g., simd)
OMP_sch_static_balanced_chunked = 45,
/// \brief Lower bound for 'ordered' versions.
OMP_ord_lower = 64,
OMP_ord_static_chunked = 65,
OMP_ord_static = 66,
OMP_ord_dynamic_chunked = 67,
OMP_ord_guided_chunked = 68,
OMP_ord_runtime = 69,
OMP_ord_auto = 70,
OMP_sch_default = OMP_sch_static,
/// \brief dist_schedule types
OMP_dist_sch_static_chunked = 91,
OMP_dist_sch_static = 92,
/// Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
/// Set if the monotonic schedule modifier was present.
OMP_sch_modifier_monotonic = (1 << 29),
/// Set if the nonmonotonic schedule modifier was present.
OMP_sch_modifier_nonmonotonic = (1 << 30),
};
enum OpenMPRTLFunction {
/// \brief Call to void __kmpc_fork_call(ident_t *loc, kmp_int32 argc,
/// kmpc_micro microtask, ...);
OMPRTL__kmpc_fork_call,
/// \brief Call to void *__kmpc_threadprivate_cached(ident_t *loc,
/// kmp_int32 global_tid, void *data, size_t size, void ***cache);
OMPRTL__kmpc_threadprivate_cached,
/// \brief Call to void __kmpc_threadprivate_register( ident_t *,
/// void *data, kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor);
OMPRTL__kmpc_threadprivate_register,
// Call to __kmpc_int32 kmpc_global_thread_num(ident_t *loc);
OMPRTL__kmpc_global_thread_num,
// Call to void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *crit);
OMPRTL__kmpc_critical,
// Call to void __kmpc_critical_with_hint(ident_t *loc, kmp_int32
// global_tid, kmp_critical_name *crit, uintptr_t hint);
OMPRTL__kmpc_critical_with_hint,
// Call to void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *crit);
OMPRTL__kmpc_end_critical,
// Call to kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32
// global_tid);
OMPRTL__kmpc_cancel_barrier,
// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_barrier,
// Call to void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_for_static_fini,
// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
// global_tid);
OMPRTL__kmpc_serialized_parallel,
// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
// global_tid);
OMPRTL__kmpc_end_serialized_parallel,
// Call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 num_threads);
OMPRTL__kmpc_push_num_threads,
// Call to void __kmpc_flush(ident_t *loc);
OMPRTL__kmpc_flush,
// Call to kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid);
OMPRTL__kmpc_master,
// Call to void __kmpc_end_master(ident_t *, kmp_int32 global_tid);
OMPRTL__kmpc_end_master,
// Call to kmp_int32 __kmpc_omp_taskyield(ident_t *, kmp_int32 global_tid,
// int end_part);
OMPRTL__kmpc_omp_taskyield,
// Call to kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid);
OMPRTL__kmpc_single,
// Call to void __kmpc_end_single(ident_t *, kmp_int32 global_tid);
OMPRTL__kmpc_end_single,
// Call to kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
// kmp_routine_entry_t *task_entry);
OMPRTL__kmpc_omp_task_alloc,
// Call to kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t *
// new_task);
OMPRTL__kmpc_omp_task,
// Call to void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
// size_t cpy_size, void *cpy_data, void(*cpy_func)(void *, void *),
// kmp_int32 didit);
OMPRTL__kmpc_copyprivate,
// Call to kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void
// (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck);
OMPRTL__kmpc_reduce,
// Call to kmp_int32 __kmpc_reduce_nowait(ident_t *loc, kmp_int32
// global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
// void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name
// *lck);
OMPRTL__kmpc_reduce_nowait,
// Call to void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *lck);
OMPRTL__kmpc_end_reduce,
// Call to void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *lck);
OMPRTL__kmpc_end_reduce_nowait,
// Call to void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid,
// kmp_task_t * new_task);
OMPRTL__kmpc_omp_task_begin_if0,
// Call to void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
// kmp_task_t * new_task);
OMPRTL__kmpc_omp_task_complete_if0,
// Call to void __kmpc_ordered(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_ordered,
// Call to void __kmpc_end_ordered(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_end_ordered,
// Call to kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
// global_tid);
OMPRTL__kmpc_omp_taskwait,
// Call to void __kmpc_taskgroup(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_taskgroup,
// Call to void __kmpc_end_taskgroup(ident_t *loc, kmp_int32 global_tid);
OMPRTL__kmpc_end_taskgroup,
// Call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
// int proc_bind);
OMPRTL__kmpc_push_proc_bind,
// Call to kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32
// gtid, kmp_task_t * new_task, kmp_int32 ndeps, kmp_depend_info_t
// *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
OMPRTL__kmpc_omp_task_with_deps,
// Call to void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32
// gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
// ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
OMPRTL__kmpc_omp_wait_deps,
// Call to kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32
// global_tid, kmp_int32 cncl_kind);
OMPRTL__kmpc_cancellationpoint,
// Call to kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 cncl_kind);
OMPRTL__kmpc_cancel,
// Call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 num_teams, kmp_int32 thread_limit);
OMPRTL__kmpc_push_num_teams,
// Call to void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro
// microtask, ...);
OMPRTL__kmpc_fork_teams,
// Call to void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
// if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
// sched, kmp_uint64 grainsize, void *task_dup);
OMPRTL__kmpc_taskloop,
// Call to void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32
// num_dims, struct kmp_dim *dims);
OMPRTL__kmpc_doacross_init,
// Call to void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
OMPRTL__kmpc_doacross_fini,
// Call to void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64
// *vec);
OMPRTL__kmpc_doacross_post,
// Call to void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64
// *vec);
OMPRTL__kmpc_doacross_wait,
//
// Offloading related calls
//
// Call to int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
// arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t
// *arg_types);
OMPRTL__tgt_target,
// Call to int32_t __tgt_target_teams(int32_t device_id, void *host_ptr,
// int32_t arg_num, void** args_base, void **args, size_t *arg_sizes,
// int32_t *arg_types, int32_t num_teams, int32_t thread_limit);
OMPRTL__tgt_target_teams,
// Call to void __tgt_register_lib(__tgt_bin_desc *desc);
OMPRTL__tgt_register_lib,
// Call to void __tgt_unregister_lib(__tgt_bin_desc *desc);
OMPRTL__tgt_unregister_lib,
// Call to void __tgt_target_data_begin(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
OMPRTL__tgt_target_data_begin,
// Call to void __tgt_target_data_end(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
OMPRTL__tgt_target_data_end,
// Call to void __tgt_target_data_update(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
OMPRTL__tgt_target_data_update,
};
/// A basic class for pre|post-action for advanced codegen sequence for OpenMP
/// region.
class CleanupTy final : public EHScopeStack::Cleanup {
PrePostActionTy *Action;
public:
explicit CleanupTy(PrePostActionTy *Action) : Action(Action) {}
void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
if (!CGF.HaveInsertPoint())
return;
Action->Exit(CGF);
}
};
} // anonymous namespace
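// Run the attached code-generation callback inside its own cleanup scope. If a
// pre/post action is attached, its Exit() is registered as a normal-and-EH
// cleanup so it also runs when the region is left via an exception.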
void RegionCodeGenTy::operator()(CodeGenFunction &CGF) const {
CodeGenFunction::RunCleanupsScope Scope(CGF);
if (PrePostAction) {
CGF.EHStack.pushCleanup<CleanupTy>(NormalAndEHCleanup, PrePostAction);
Callback(CodeGen, CGF, *PrePostAction);
} else {
PrePostActionTy Action;
Callback(CodeGen, CGF, Action);
}
}
LValue CGOpenMPRegionInfo::getThreadIDVariableLValue(CodeGenFunction &CGF) {
return CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(getThreadIDVariable()),
getThreadIDVariable()->getType()->castAs<PointerType>());
}
void CGOpenMPRegionInfo::EmitBody(CodeGenFunction &CGF, const Stmt * /*S*/) {
if (!CGF.HaveInsertPoint())
return;
// 1.2.2 OpenMP Language Terminology
// Structured block - An executable statement with a single entry at the
// top and a single exit at the bottom.
// The point of exit cannot be a branch out of the structured block.
// longjmp() and throw() must not violate the entry/exit criteria.
CGF.EHStack.pushTerminate();
CodeGen(CGF);
CGF.EHStack.popTerminate();
}
LValue CGOpenMPTaskOutlinedRegionInfo::getThreadIDVariableLValue(
CodeGenFunction &CGF) {
return CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(getThreadIDVariable()),
getThreadIDVariable()->getType(),
AlignmentSource::Decl);
}
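// The ident_t type built here mirrors the source-location descriptor used by
// the OpenMP runtime: four i32 fields (reserved_1, flags, reserved_2,
// reserved_3) followed by a char *psource pointing at a
// ";file;function;line;column;;" string (see getOrCreateDefaultLocation and
// emitUpdateLocation below).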
CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
: CGM(CGM), OffloadEntriesInfoManager(CGM) {
IdentTy = llvm::StructType::create(
"ident_t", CGM.Int32Ty /* reserved_1 */, CGM.Int32Ty /* flags */,
CGM.Int32Ty /* reserved_2 */, CGM.Int32Ty /* reserved_3 */,
CGM.Int8PtrTy /* psource */, nullptr);
KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8);
loadOffloadInfoMetadata();
}
void CGOpenMPRuntime::clear() {
InternalVars.clear();
}
static llvm::Function *
emitCombinerOrInitializer(CodeGenModule &CGM, QualType Ty,
const Expr *CombinerInitializer, const VarDecl *In,
const VarDecl *Out, bool IsCombiner) {
// void .omp_combiner.(Ty *in, Ty *out);
auto &C = CGM.getContext();
QualType PtrTy = C.getPointerType(Ty).withRestrict();
FunctionArgList Args;
ImplicitParamDecl OmpOutParm(C, /*DC=*/nullptr, Out->getLocation(),
/*Id=*/nullptr, PtrTy);
ImplicitParamDecl OmpInParm(C, /*DC=*/nullptr, In->getLocation(),
/*Id=*/nullptr, PtrTy);
Args.push_back(&OmpOutParm);
Args.push_back(&OmpInParm);
auto &FnInfo =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo);
auto *Fn = llvm::Function::Create(
FnTy, llvm::GlobalValue::InternalLinkage,
IsCombiner ? ".omp_combiner." : ".omp_initializer.", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo);
Fn->removeFnAttr(llvm::Attribute::NoInline);
Fn->addFnAttr(llvm::Attribute::AlwaysInline);
CodeGenFunction CGF(CGM);
// Map "T omp_in;" variable to "*omp_in_parm" value in all expressions.
// Map "T omp_out;" variable to "*omp_out_parm" value in all expressions.
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
CodeGenFunction::OMPPrivateScope Scope(CGF);
Address AddrIn = CGF.GetAddrOfLocalVar(&OmpInParm);
Scope.addPrivate(In, [&CGF, AddrIn, PtrTy]() -> Address {
return CGF.EmitLoadOfPointerLValue(AddrIn, PtrTy->castAs<PointerType>())
.getAddress();
});
Address AddrOut = CGF.GetAddrOfLocalVar(&OmpOutParm);
Scope.addPrivate(Out, [&CGF, AddrOut, PtrTy]() -> Address {
return CGF.EmitLoadOfPointerLValue(AddrOut, PtrTy->castAs<PointerType>())
.getAddress();
});
(void)Scope.Privatize();
CGF.EmitIgnoredExpr(CombinerInitializer);
Scope.ForceCleanup();
CGF.FinishFunction();
return Fn;
}
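// Emit (at most once per declaration) the combiner and, if present, the
// initializer helpers for a user-defined reduction. The resulting functions
// are cached in UDRMap; when emitted inside a function, the declaration is
// also recorded in FunctionUDRMap so the cache entry can be dropped in
// functionFinished().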
void CGOpenMPRuntime::emitUserDefinedReduction(
CodeGenFunction *CGF, const OMPDeclareReductionDecl *D) {
if (UDRMap.count(D) > 0)
return;
auto &C = CGM.getContext();
if (!In || !Out) {
In = &C.Idents.get("omp_in");
Out = &C.Idents.get("omp_out");
}
llvm::Function *Combiner = emitCombinerOrInitializer(
CGM, D->getType(), D->getCombiner(), cast<VarDecl>(D->lookup(In).front()),
cast<VarDecl>(D->lookup(Out).front()),
/*IsCombiner=*/true);
llvm::Function *Initializer = nullptr;
if (auto *Init = D->getInitializer()) {
if (!Priv || !Orig) {
Priv = &C.Idents.get("omp_priv");
Orig = &C.Idents.get("omp_orig");
}
Initializer = emitCombinerOrInitializer(
CGM, D->getType(), Init, cast<VarDecl>(D->lookup(Orig).front()),
cast<VarDecl>(D->lookup(Priv).front()),
/*IsCombiner=*/false);
}
UDRMap.insert(std::make_pair(D, std::make_pair(Combiner, Initializer)));
if (CGF) {
auto &Decls = FunctionUDRMap.FindAndConstruct(CGF->CurFn);
Decls.second.push_back(D);
}
}
std::pair<llvm::Function *, llvm::Function *>
CGOpenMPRuntime::getUserDefinedReduction(const OMPDeclareReductionDecl *D) {
auto I = UDRMap.find(D);
if (I != UDRMap.end())
return I->second;
emitUserDefinedReduction(/*CGF=*/nullptr, D);
return UDRMap.lookup(D);
}
// Layout information for ident_t.
static CharUnits getIdentAlign(CodeGenModule &CGM) {
return CGM.getPointerAlign();
}
static CharUnits getIdentSize(CodeGenModule &CGM) {
assert((4 * CGM.getPointerSize()).isMultipleOf(CGM.getPointerAlign()));
return CharUnits::fromQuantity(16) + CGM.getPointerSize();
}
static CharUnits getOffsetOfIdentField(IdentFieldIndex Field) {
// All the fields except the last are i32, so this works beautifully.
return unsigned(Field) * CharUnits::fromQuantity(4);
}
static Address createIdentFieldGEP(CodeGenFunction &CGF, Address Addr,
IdentFieldIndex Field,
const llvm::Twine &Name = "") {
auto Offset = getOffsetOfIdentField(Field);
return CGF.Builder.CreateStructGEP(Addr, Field, Offset, Name);
}
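// Outline the captured statement of a parallel or teams directive into a
// helper whose leading parameters match the kmpc_micro signature
// (kmp_int32 *global_tid, kmp_int32 *bound_tid, <captured vars>...); the
// result can then be passed to __kmpc_fork_call or, for teams directives,
// __kmpc_fork_teams.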
llvm::Value *CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
assert(ThreadIDVar->getType()->isPointerType() &&
"thread id variable must be of type kmp_int32 *");
const CapturedStmt *CS = cast<CapturedStmt>(D.getAssociatedStmt());
CodeGenFunction CGF(CGM, true);
bool HasCancel = false;
if (auto *OPD = dyn_cast<OMPParallelDirective>(&D))
HasCancel = OPD->hasCancel();
else if (auto *OPSD = dyn_cast<OMPParallelSectionsDirective>(&D))
HasCancel = OPSD->hasCancel();
else if (auto *OPFD = dyn_cast<OMPParallelForDirective>(&D))
HasCancel = OPFD->hasCancel();
CGOpenMPOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind,
HasCancel, getOutlinedHelperName());
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
return CGF.GenerateOpenMPCapturedStmtFunction(*CS);
}
llvm::Value *CGOpenMPRuntime::emitTaskOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
const VarDecl *PartIDVar, const VarDecl *TaskTVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen,
bool Tied, unsigned &NumberOfParts) {
auto &&UntiedCodeGen = [this, &D, TaskTVar](CodeGenFunction &CGF,
PrePostActionTy &) {
auto *ThreadID = getThreadID(CGF, D.getLocStart());
auto *UpLoc = emitUpdateLocation(CGF, D.getLocStart());
llvm::Value *TaskArgs[] = {
UpLoc, ThreadID,
CGF.EmitLoadOfPointerLValue(CGF.GetAddrOfLocalVar(TaskTVar),
TaskTVar->getType()->castAs<PointerType>())
.getPointer()};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task), TaskArgs);
};
CGOpenMPTaskOutlinedRegionInfo::UntiedTaskActionTy Action(Tied, PartIDVar,
UntiedCodeGen);
CodeGen.setAction(Action);
assert(!ThreadIDVar->getType()->isPointerType() &&
"thread id variable must be of type kmp_int32 for tasks");
auto *CS = cast<CapturedStmt>(D.getAssociatedStmt());
auto *TD = dyn_cast<OMPTaskDirective>(&D);
CodeGenFunction CGF(CGM, true);
CGOpenMPTaskOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen,
InnermostKind,
TD ? TD->hasCancel() : false, Action);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
auto *Res = CGF.GenerateCapturedStmtFunction(*CS);
if (!Tied)
NumberOfParts = Action.getNumberOfParts();
return Res;
}
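// Lazily create one private global ident_t per distinct Flags value, filled
// with the default ";unknown;unknown;0;0;;" psource string; the globals are
// cached in OpenMPDefaultLocMap.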
Address CGOpenMPRuntime::getOrCreateDefaultLocation(unsigned Flags) {
CharUnits Align = getIdentAlign(CGM);
llvm::Value *Entry = OpenMPDefaultLocMap.lookup(Flags);
if (!Entry) {
if (!DefaultOpenMPPSource) {
// Initialize the default psource value for the ident_t structure of all
// ident_t objects. The format is ";file;function;line;column;;".
// Taken from
// http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp_str.c
DefaultOpenMPPSource =
CGM.GetAddrOfConstantCString(";unknown;unknown;0;0;;").getPointer();
DefaultOpenMPPSource =
llvm::ConstantExpr::getBitCast(DefaultOpenMPPSource, CGM.Int8PtrTy);
}
ConstantInitBuilder builder(CGM);
auto fields = builder.beginStruct(IdentTy);
fields.addInt(CGM.Int32Ty, 0);
fields.addInt(CGM.Int32Ty, Flags);
fields.addInt(CGM.Int32Ty, 0);
fields.addInt(CGM.Int32Ty, 0);
fields.add(DefaultOpenMPPSource);
auto DefaultOpenMPLocation =
fields.finishAndCreateGlobal("", Align, /*isConstant*/ true,
llvm::GlobalValue::PrivateLinkage);
DefaultOpenMPLocation->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
OpenMPDefaultLocMap[Flags] = Entry = DefaultOpenMPLocation;
}
return Address(Entry, Align);
}
llvm::Value *CGOpenMPRuntime::emitUpdateLocation(CodeGenFunction &CGF,
SourceLocation Loc,
unsigned Flags) {
Flags |= OMP_IDENT_KMPC;
// If no debug info is generated - return global default location.
if (CGM.getCodeGenOpts().getDebugInfo() == codegenoptions::NoDebugInfo ||
Loc.isInvalid())
return getOrCreateDefaultLocation(Flags).getPointer();
assert(CGF.CurFn && "No function in current CodeGenFunction.");
Address LocValue = Address::invalid();
auto I = OpenMPLocThreadIDMap.find(CGF.CurFn);
if (I != OpenMPLocThreadIDMap.end())
LocValue = Address(I->second.DebugLoc, getIdentAlign(CGF.CGM));
// OpenMPLocThreadIDMap may have a null DebugLoc and a non-null ThreadID if
// getThreadID was called before this routine.
if (!LocValue.isValid()) {
// Generate "ident_t .kmpc_loc.addr;"
Address AI = CGF.CreateTempAlloca(IdentTy, getIdentAlign(CGF.CGM),
".kmpc_loc.addr");
auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn);
Elem.second.DebugLoc = AI.getPointer();
LocValue = AI;
CGBuilderTy::InsertPointGuard IPG(CGF.Builder);
CGF.Builder.SetInsertPoint(CGF.AllocaInsertPt);
CGF.Builder.CreateMemCpy(LocValue, getOrCreateDefaultLocation(Flags),
CGM.getSize(getIdentSize(CGF.CGM)));
}
// char **psource = &.kmpc_loc_<flags>.addr.psource;
Address PSource = createIdentFieldGEP(CGF, LocValue, IdentField_PSource);
auto OMPDebugLoc = OpenMPDebugLocMap.lookup(Loc.getRawEncoding());
if (OMPDebugLoc == nullptr) {
SmallString<128> Buffer2;
llvm::raw_svector_ostream OS2(Buffer2);
// Build debug location
PresumedLoc PLoc = CGF.getContext().getSourceManager().getPresumedLoc(Loc);
OS2 << ";" << PLoc.getFilename() << ";";
if (const FunctionDecl *FD =
dyn_cast_or_null<FunctionDecl>(CGF.CurFuncDecl)) {
OS2 << FD->getQualifiedNameAsString();
}
OS2 << ";" << PLoc.getLine() << ";" << PLoc.getColumn() << ";;";
OMPDebugLoc = CGF.Builder.CreateGlobalStringPtr(OS2.str());
OpenMPDebugLocMap[Loc.getRawEncoding()] = OMPDebugLoc;
}
// *psource = ";<File>;<Function>;<Line>;<Column>;;";
CGF.Builder.CreateStore(OMPDebugLoc, PSource);
// Our callers always pass this to a runtime function, so for
// convenience, go ahead and return a naked pointer.
return LocValue.getPointer();
}
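// Obtain the kmp_int32 global thread id for the current function: reuse a
// value cached in OpenMPLocThreadIDMap, load it from the outlined region's
// thread-id argument, or, in regular serial code, call
// __kmpc_global_thread_num in the function entry block and cache the result.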
llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF,
SourceLocation Loc) {
assert(CGF.CurFn && "No function in current CodeGenFunction.");
llvm::Value *ThreadID = nullptr;
// Check whether we've already cached a load of the thread id in this
// function.
auto I = OpenMPLocThreadIDMap.find(CGF.CurFn);
if (I != OpenMPLocThreadIDMap.end()) {
ThreadID = I->second.ThreadID;
if (ThreadID != nullptr)
return ThreadID;
}
if (auto *OMPRegionInfo =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
if (OMPRegionInfo->getThreadIDVariable()) {
// Check if this is an outlined function with the thread id passed as an
// argument.
auto LVal = OMPRegionInfo->getThreadIDVariableLValue(CGF);
ThreadID = CGF.EmitLoadOfLValue(LVal, Loc).getScalarVal();
// If the value was loaded in the entry block, cache it and reuse it
// throughout the function.
if (CGF.Builder.GetInsertBlock() == CGF.AllocaInsertPt->getParent()) {
auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn);
Elem.second.ThreadID = ThreadID;
}
return ThreadID;
}
}
// This is not an outlined function region - we need to call kmp_int32
// __kmpc_global_thread_num(ident_t *loc).
// Generate the thread id value and cache it for use across the function.
CGBuilderTy::InsertPointGuard IPG(CGF.Builder);
CGF.Builder.SetInsertPoint(CGF.AllocaInsertPt);
ThreadID =
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_global_thread_num),
emitUpdateLocation(CGF, Loc));
auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn);
Elem.second.ThreadID = ThreadID;
return ThreadID;
}
void CGOpenMPRuntime::functionFinished(CodeGenFunction &CGF) {
assert(CGF.CurFn && "No function in current CodeGenFunction.");
if (OpenMPLocThreadIDMap.count(CGF.CurFn))
OpenMPLocThreadIDMap.erase(CGF.CurFn);
if (FunctionUDRMap.count(CGF.CurFn) > 0) {
for(auto *D : FunctionUDRMap[CGF.CurFn]) {
UDRMap.erase(D);
}
FunctionUDRMap.erase(CGF.CurFn);
}
}
llvm::Type *CGOpenMPRuntime::getIdentTyPointerTy() {
return llvm::PointerType::getUnqual(IdentTy);
}
llvm::Type *CGOpenMPRuntime::getKmpc_MicroPointerTy() {
if (!Kmpc_MicroTy) {
// Build void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid,...)
llvm::Type *MicroParams[] = {llvm::PointerType::getUnqual(CGM.Int32Ty),
llvm::PointerType::getUnqual(CGM.Int32Ty)};
Kmpc_MicroTy = llvm::FunctionType::get(CGM.VoidTy, MicroParams, true);
}
return llvm::PointerType::getUnqual(Kmpc_MicroTy);
}
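// Return (creating the declaration on first use) the requested libomp or
// libomptarget entry point. Each case below builds the LLVM function type
// corresponding to the prototype quoted in the OpenMPRTLFunction enum above.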
llvm::Constant *
CGOpenMPRuntime::createRuntimeFunction(unsigned Function) {
llvm::Constant *RTLFn = nullptr;
switch (static_cast<OpenMPRTLFunction>(Function)) {
case OMPRTL__kmpc_fork_call: {
// Build void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro
// microtask, ...);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
getKmpc_MicroPointerTy()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ true);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_fork_call");
break;
}
case OMPRTL__kmpc_global_thread_num: {
// Build kmp_int32 __kmpc_global_thread_num(ident_t *loc);
llvm::Type *TypeParams[] = {getIdentTyPointerTy()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_global_thread_num");
break;
}
case OMPRTL__kmpc_threadprivate_cached: {
// Build void *__kmpc_threadprivate_cached(ident_t *loc,
// kmp_int32 global_tid, void *data, size_t size, void ***cache);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.VoidPtrTy, CGM.SizeTy,
CGM.VoidPtrTy->getPointerTo()->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_threadprivate_cached");
break;
}
case OMPRTL__kmpc_critical: {
// Build void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *crit);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty,
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_critical");
break;
}
case OMPRTL__kmpc_critical_with_hint: {
// Build void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *crit, uintptr_t hint);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
llvm::PointerType::getUnqual(KmpCriticalNameTy),
CGM.IntPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_critical_with_hint");
break;
}
case OMPRTL__kmpc_threadprivate_register: {
// Build void __kmpc_threadprivate_register(ident_t *, void *data,
// kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor);
// typedef void *(*kmpc_ctor)(void *);
auto KmpcCtorTy =
llvm::FunctionType::get(CGM.VoidPtrTy, CGM.VoidPtrTy,
/*isVarArg*/ false)->getPointerTo();
// typedef void *(*kmpc_cctor)(void *, void *);
llvm::Type *KmpcCopyCtorTyArgs[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
auto KmpcCopyCtorTy =
llvm::FunctionType::get(CGM.VoidPtrTy, KmpcCopyCtorTyArgs,
/*isVarArg*/ false)->getPointerTo();
// typedef void (*kmpc_dtor)(void *);
auto KmpcDtorTy =
llvm::FunctionType::get(CGM.VoidTy, CGM.VoidPtrTy, /*isVarArg*/ false)
->getPointerTo();
llvm::Type *FnTyArgs[] = {getIdentTyPointerTy(), CGM.VoidPtrTy, KmpcCtorTy,
KmpcCopyCtorTy, KmpcDtorTy};
auto FnTy = llvm::FunctionType::get(CGM.VoidTy, FnTyArgs,
/*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_threadprivate_register");
break;
}
case OMPRTL__kmpc_end_critical: {
// Build void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *crit);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty,
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_critical");
break;
}
case OMPRTL__kmpc_cancel_barrier: {
// Build kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32
// global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_cancel_barrier");
break;
}
case OMPRTL__kmpc_barrier: {
// Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
break;
}
case OMPRTL__kmpc_for_static_fini: {
// Build void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_for_static_fini");
break;
}
case OMPRTL__kmpc_push_num_threads: {
// Build void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 num_threads)
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_num_threads");
break;
}
case OMPRTL__kmpc_serialized_parallel: {
// Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
// global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
break;
}
case OMPRTL__kmpc_end_serialized_parallel: {
// Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
// global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
break;
}
case OMPRTL__kmpc_flush: {
// Build void __kmpc_flush(ident_t *loc);
llvm::Type *TypeParams[] = {getIdentTyPointerTy()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_flush");
break;
}
case OMPRTL__kmpc_master: {
// Build kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_master");
break;
}
case OMPRTL__kmpc_end_master: {
// Build void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_master");
break;
}
case OMPRTL__kmpc_omp_taskyield: {
// Build kmp_int32 __kmpc_omp_taskyield(ident_t *, kmp_int32 global_tid,
// int end_part);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_taskyield");
break;
}
case OMPRTL__kmpc_single: {
// Build kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_single");
break;
}
case OMPRTL__kmpc_end_single: {
// Build void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_single");
break;
}
case OMPRTL__kmpc_omp_task_alloc: {
// Build kmp_task_t *__kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
// kmp_routine_entry_t *task_entry);
assert(KmpRoutineEntryPtrTy != nullptr &&
"Type kmp_routine_entry_t must be created.");
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty,
CGM.SizeTy, CGM.SizeTy, KmpRoutineEntryPtrTy};
// Return void * and then cast to particular kmp_task_t type.
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task_alloc");
break;
}
case OMPRTL__kmpc_omp_task: {
// Build kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t
// *new_task);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task");
break;
}
case OMPRTL__kmpc_copyprivate: {
// Build void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
// size_t cpy_size, void *cpy_data, void(*cpy_func)(void *, void *),
// kmp_int32 didit);
llvm::Type *CpyTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
auto *CpyFnTy =
llvm::FunctionType::get(CGM.VoidTy, CpyTypeParams, /*isVarArg=*/false);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.SizeTy,
CGM.VoidPtrTy, CpyFnTy->getPointerTo(),
CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_copyprivate");
break;
}
case OMPRTL__kmpc_reduce: {
// Build kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void
// (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck);
llvm::Type *ReduceTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
auto *ReduceFnTy = llvm::FunctionType::get(CGM.VoidTy, ReduceTypeParams,
/*isVarArg=*/false);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy,
CGM.VoidPtrTy, ReduceFnTy->getPointerTo(),
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_reduce");
break;
}
case OMPRTL__kmpc_reduce_nowait: {
// Build kmp_int32 __kmpc_reduce_nowait(ident_t *loc, kmp_int32
// global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
// void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name
// *lck);
llvm::Type *ReduceTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
auto *ReduceFnTy = llvm::FunctionType::get(CGM.VoidTy, ReduceTypeParams,
/*isVarArg=*/false);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy,
CGM.VoidPtrTy, ReduceFnTy->getPointerTo(),
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_reduce_nowait");
break;
}
case OMPRTL__kmpc_end_reduce: {
// Build void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *lck);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty,
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_reduce");
break;
}
case OMPRTL__kmpc_end_reduce_nowait: {
// Build void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
// kmp_critical_name *lck);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty,
llvm::PointerType::getUnqual(KmpCriticalNameTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn =
CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_end_reduce_nowait");
break;
}
case OMPRTL__kmpc_omp_task_begin_if0: {
// Build void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid, kmp_task_t
// *new_task);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn =
CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task_begin_if0");
break;
}
case OMPRTL__kmpc_omp_task_complete_if0: {
// Build void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
// kmp_task_t *new_task);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy,
/*Name=*/"__kmpc_omp_task_complete_if0");
break;
}
case OMPRTL__kmpc_ordered: {
// Build void __kmpc_ordered(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_ordered");
break;
}
case OMPRTL__kmpc_end_ordered: {
// Build void __kmpc_end_ordered(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_ordered");
break;
}
case OMPRTL__kmpc_omp_taskwait: {
// Build kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_omp_taskwait");
break;
}
case OMPRTL__kmpc_taskgroup: {
// Build void __kmpc_taskgroup(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_taskgroup");
break;
}
case OMPRTL__kmpc_end_taskgroup: {
// Build void __kmpc_end_taskgroup(ident_t *loc, kmp_int32 global_tid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_taskgroup");
break;
}
case OMPRTL__kmpc_push_proc_bind: {
// Build void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
// int proc_bind)
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_proc_bind");
break;
}
case OMPRTL__kmpc_omp_task_with_deps: {
// Build kmp_int32 __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid,
// kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
// kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), CGM.Int32Ty, CGM.VoidPtrTy, CGM.Int32Ty,
CGM.VoidPtrTy, CGM.Int32Ty, CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
RTLFn =
CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_task_with_deps");
break;
}
case OMPRTL__kmpc_omp_wait_deps: {
// Build void __kmpc_omp_wait_deps(ident_t *, kmp_int32 gtid,
// kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
// kmp_depend_info_t *noalias_dep_list);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.Int32Ty, CGM.VoidPtrTy,
CGM.Int32Ty, CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_omp_wait_deps");
break;
}
case OMPRTL__kmpc_cancellationpoint: {
// Build kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32
// global_tid, kmp_int32 cncl_kind)
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancellationpoint");
break;
}
case OMPRTL__kmpc_cancel: {
// Build kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 cncl_kind)
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancel");
break;
}
case OMPRTL__kmpc_push_num_teams: {
// Build void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 num_teams, kmp_int32 num_threads);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty,
CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_num_teams");
break;
}
case OMPRTL__kmpc_fork_teams: {
// Build void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro
// microtask, ...);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
getKmpc_MicroPointerTy()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ true);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_fork_teams");
break;
}
case OMPRTL__kmpc_taskloop: {
// Build void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
// if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
// sched, kmp_uint64 grainsize, void *task_dup);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
CGM.IntTy,
CGM.VoidPtrTy,
CGM.IntTy,
CGM.Int64Ty->getPointerTo(),
CGM.Int64Ty->getPointerTo(),
CGM.Int64Ty,
CGM.IntTy,
CGM.IntTy,
CGM.Int64Ty,
CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_taskloop");
break;
}
case OMPRTL__kmpc_doacross_init: {
// Build void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32
// num_dims, struct kmp_dim *dims);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
CGM.Int32Ty,
CGM.Int32Ty,
CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_init");
break;
}
case OMPRTL__kmpc_doacross_fini: {
// Build void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_fini");
break;
}
case OMPRTL__kmpc_doacross_post: {
// Build void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64
// *vec);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.Int64Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_post");
break;
}
case OMPRTL__kmpc_doacross_wait: {
// Build void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64
// *vec);
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
CGM.Int64Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_wait");
break;
}
case OMPRTL__tgt_target: {
// Build int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
// arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t
// *arg_types);
llvm::Type *TypeParams[] = {CGM.Int32Ty,
CGM.VoidPtrTy,
CGM.Int32Ty,
CGM.VoidPtrPtrTy,
CGM.VoidPtrPtrTy,
CGM.SizeTy->getPointerTo(),
CGM.Int32Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target");
break;
}
case OMPRTL__tgt_target_teams: {
// Build int32_t __tgt_target_teams(int32_t device_id, void *host_ptr,
// int32_t arg_num, void** args_base, void **args, size_t *arg_sizes,
// int32_t *arg_types, int32_t num_teams, int32_t thread_limit);
llvm::Type *TypeParams[] = {CGM.Int32Ty,
CGM.VoidPtrTy,
CGM.Int32Ty,
CGM.VoidPtrPtrTy,
CGM.VoidPtrPtrTy,
CGM.SizeTy->getPointerTo(),
CGM.Int32Ty->getPointerTo(),
CGM.Int32Ty,
CGM.Int32Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_teams");
break;
}
case OMPRTL__tgt_register_lib: {
// Build void __tgt_register_lib(__tgt_bin_desc *desc);
QualType ParamTy =
CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy());
llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_register_lib");
break;
}
case OMPRTL__tgt_unregister_lib: {
// Build void __tgt_unregister_lib(__tgt_bin_desc *desc);
QualType ParamTy =
CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy());
llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_unregister_lib");
break;
}
case OMPRTL__tgt_target_data_begin: {
// Build void __tgt_target_data_begin(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
llvm::Type *TypeParams[] = {CGM.Int32Ty,
CGM.Int32Ty,
CGM.VoidPtrPtrTy,
CGM.VoidPtrPtrTy,
CGM.SizeTy->getPointerTo(),
CGM.Int32Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_begin");
break;
}
case OMPRTL__tgt_target_data_end: {
// Build void __tgt_target_data_end(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
llvm::Type *TypeParams[] = {CGM.Int32Ty,
CGM.Int32Ty,
CGM.VoidPtrPtrTy,
CGM.VoidPtrPtrTy,
CGM.SizeTy->getPointerTo(),
CGM.Int32Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_end");
break;
}
case OMPRTL__tgt_target_data_update: {
// Build void __tgt_target_data_update(int32_t device_id, int32_t arg_num,
// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
llvm::Type *TypeParams[] = {CGM.Int32Ty,
CGM.Int32Ty,
CGM.VoidPtrPtrTy,
CGM.VoidPtrPtrTy,
CGM.SizeTy->getPointerTo(),
CGM.Int32Ty->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_update");
break;
}
}
assert(RTLFn && "Unable to find OpenMP runtime function");
return RTLFn;
}
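// The __kmpc_for_static_init_* and __kmpc_dispatch_* helpers below come in
// _4/_4u/_8/_8u flavors; the suffix encodes the width and signedness of the
// loop induction variable (kmp_int32/kmp_uint32/kmp_int64/kmp_uint64).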
llvm::Constant *CGOpenMPRuntime::createForStaticInitFunction(unsigned IVSize,
bool IVSigned) {
assert((IVSize == 32 || IVSize == 64) &&
"IV size is not compatible with the omp runtime");
auto Name = IVSize == 32 ? (IVSigned ? "__kmpc_for_static_init_4"
: "__kmpc_for_static_init_4u")
: (IVSigned ? "__kmpc_for_static_init_8"
: "__kmpc_for_static_init_8u");
auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
auto PtrTy = llvm::PointerType::getUnqual(ITy);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), // loc
CGM.Int32Ty, // tid
CGM.Int32Ty, // schedtype
llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
PtrTy, // p_lower
PtrTy, // p_upper
PtrTy, // p_stride
ITy, // incr
ITy // chunk
};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
return CGM.CreateRuntimeFunction(FnTy, Name);
}
llvm::Constant *CGOpenMPRuntime::createDispatchInitFunction(unsigned IVSize,
bool IVSigned) {
assert((IVSize == 32 || IVSize == 64) &&
"IV size is not compatible with the omp runtime");
auto Name =
IVSize == 32
? (IVSigned ? "__kmpc_dispatch_init_4" : "__kmpc_dispatch_init_4u")
: (IVSigned ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_8u");
auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
llvm::Type *TypeParams[] = { getIdentTyPointerTy(), // loc
CGM.Int32Ty, // tid
CGM.Int32Ty, // schedtype
ITy, // lower
ITy, // upper
ITy, // stride
ITy // chunk
};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
return CGM.CreateRuntimeFunction(FnTy, Name);
}
llvm::Constant *CGOpenMPRuntime::createDispatchFiniFunction(unsigned IVSize,
bool IVSigned) {
assert((IVSize == 32 || IVSize == 64) &&
"IV size is not compatible with the omp runtime");
auto Name =
IVSize == 32
? (IVSigned ? "__kmpc_dispatch_fini_4" : "__kmpc_dispatch_fini_4u")
: (IVSigned ? "__kmpc_dispatch_fini_8" : "__kmpc_dispatch_fini_8u");
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), // loc
CGM.Int32Ty, // tid
};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
return CGM.CreateRuntimeFunction(FnTy, Name);
}
llvm::Constant *CGOpenMPRuntime::createDispatchNextFunction(unsigned IVSize,
bool IVSigned) {
assert((IVSize == 32 || IVSize == 64) &&
"IV size is not compatible with the omp runtime");
auto Name =
IVSize == 32
? (IVSigned ? "__kmpc_dispatch_next_4" : "__kmpc_dispatch_next_4u")
: (IVSigned ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_8u");
auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
auto PtrTy = llvm::PointerType::getUnqual(ITy);
llvm::Type *TypeParams[] = {
getIdentTyPointerTy(), // loc
CGM.Int32Ty, // tid
llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
PtrTy, // p_lower
PtrTy, // p_upper
PtrTy // p_stride
};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
return CGM.CreateRuntimeFunction(FnTy, Name);
}
llvm::Constant *
CGOpenMPRuntime::getOrCreateThreadPrivateCache(const VarDecl *VD) {
assert(!CGM.getLangOpts().OpenMPUseTLS ||
!CGM.getContext().getTargetInfo().isTLSSupported());
// Lookup the entry, lazily creating it if necessary.
return getOrCreateInternalVariable(CGM.Int8PtrPtrTy,
Twine(CGM.getMangledName(VD)) + ".cache.");
}
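// When the target cannot (or was asked not to) use TLS, accesses to a
// threadprivate variable go through __kmpc_threadprivate_cached, which returns
// the address of the calling thread's copy; with TLS the variable's own
// address is used directly.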
Address CGOpenMPRuntime::getAddrOfThreadPrivate(CodeGenFunction &CGF,
const VarDecl *VD,
Address VDAddr,
SourceLocation Loc) {
if (CGM.getLangOpts().OpenMPUseTLS &&
CGM.getContext().getTargetInfo().isTLSSupported())
return VDAddr;
auto VarTy = VDAddr.getElementType();
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
CGF.Builder.CreatePointerCast(VDAddr.getPointer(),
CGM.Int8PtrTy),
CGM.getSize(CGM.GetTargetTypeStoreSize(VarTy)),
getOrCreateThreadPrivateCache(VD)};
return Address(CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_threadprivate_cached), Args),
VDAddr.getAlignment());
}
void CGOpenMPRuntime::emitThreadPrivateVarInit(
CodeGenFunction &CGF, Address VDAddr, llvm::Value *Ctor,
llvm::Value *CopyCtor, llvm::Value *Dtor, SourceLocation Loc) {
// Call kmp_int32 __kmpc_global_thread_num(&loc) to initialize the OpenMP
// runtime library.
auto OMPLoc = emitUpdateLocation(CGF, Loc);
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_global_thread_num),
OMPLoc);
// Call __kmpc_threadprivate_register(&loc, &var, ctor, cctor/*NULL*/, dtor)
// to register the constructor/destructor for the variable.
llvm::Value *Args[] = {OMPLoc,
CGF.Builder.CreatePointerCast(VDAddr.getPointer(),
CGM.VoidPtrTy),
Ctor, CopyCtor, Dtor};
CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_threadprivate_register), Args);
}
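// Emit the constructor/destructor helpers for a threadprivate definition and
// register them with the runtime. Outside of a function (CGF == nullptr) the
// registration is wrapped in a synthesized ".__omp_threadprivate_init_."
// global initializer, which is returned to the caller.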
llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
const VarDecl *VD, Address VDAddr, SourceLocation Loc,
bool PerformInit, CodeGenFunction *CGF) {
if (CGM.getLangOpts().OpenMPUseTLS &&
CGM.getContext().getTargetInfo().isTLSSupported())
return nullptr;
VD = VD->getDefinition(CGM.getContext());
if (VD && ThreadPrivateWithDefinition.count(VD) == 0) {
ThreadPrivateWithDefinition.insert(VD);
QualType ASTTy = VD->getType();
llvm::Value *Ctor = nullptr, *CopyCtor = nullptr, *Dtor = nullptr;
auto Init = VD->getAnyInitializer();
if (CGM.getLangOpts().CPlusPlus && PerformInit) {
// Generate a function that re-emits the declaration's initializer into the
// threadprivate copy of the variable VD.
CodeGenFunction CtorCGF(CGM);
FunctionArgList Args;
ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, SourceLocation(),
/*Id=*/nullptr, CGM.getContext().VoidPtrTy);
Args.push_back(&Dst);
auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(
CGM.getContext().VoidPtrTy, Args);
auto FTy = CGM.getTypes().GetFunctionType(FI);
auto Fn = CGM.CreateGlobalInitOrDestructFunction(
FTy, ".__kmpc_global_ctor_.", FI, Loc);
CtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidPtrTy, Fn, FI,
Args, SourceLocation());
auto ArgVal = CtorCGF.EmitLoadOfScalar(
CtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false,
CGM.getContext().VoidPtrTy, Dst.getLocation());
Address Arg = Address(ArgVal, VDAddr.getAlignment());
Arg = CtorCGF.Builder.CreateElementBitCast(Arg,
CtorCGF.ConvertTypeForMem(ASTTy));
CtorCGF.EmitAnyExprToMem(Init, Arg, Init->getType().getQualifiers(),
/*IsInitializer=*/true);
ArgVal = CtorCGF.EmitLoadOfScalar(
CtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false,
CGM.getContext().VoidPtrTy, Dst.getLocation());
CtorCGF.Builder.CreateStore(ArgVal, CtorCGF.ReturnValue);
CtorCGF.FinishFunction();
Ctor = Fn;
}
if (VD->getType().isDestructedType() != QualType::DK_none) {
// Generate a function that emits a destructor call for the threadprivate copy
// of the variable VD.
CodeGenFunction DtorCGF(CGM);
FunctionArgList Args;
ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, SourceLocation(),
/*Id=*/nullptr, CGM.getContext().VoidPtrTy);
Args.push_back(&Dst);
auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(
CGM.getContext().VoidTy, Args);
auto FTy = CGM.getTypes().GetFunctionType(FI);
auto Fn = CGM.CreateGlobalInitOrDestructFunction(
FTy, ".__kmpc_global_dtor_.", FI, Loc);
auto NL = ApplyDebugLocation::CreateEmpty(DtorCGF);
DtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, Fn, FI, Args,
SourceLocation());
// Create a scope with an artificial location for the body of this function.
auto AL = ApplyDebugLocation::CreateArtificial(DtorCGF);
auto ArgVal = DtorCGF.EmitLoadOfScalar(
DtorCGF.GetAddrOfLocalVar(&Dst),
/*Volatile=*/false, CGM.getContext().VoidPtrTy, Dst.getLocation());
DtorCGF.emitDestroy(Address(ArgVal, VDAddr.getAlignment()), ASTTy,
DtorCGF.getDestroyer(ASTTy.isDestructedType()),
DtorCGF.needsEHCleanup(ASTTy.isDestructedType()));
DtorCGF.FinishFunction();
Dtor = Fn;
}
// Do not emit init function if it is not required.
if (!Ctor && !Dtor)
return nullptr;
llvm::Type *CopyCtorTyArgs[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
auto CopyCtorTy =
llvm::FunctionType::get(CGM.VoidPtrTy, CopyCtorTyArgs,
/*isVarArg=*/false)->getPointerTo();
// Copy constructor for the threadprivate variable. This parameter is reserved
// by the runtime and must currently always be NULL; passing anything else
// fires a runtime assertion.
CopyCtor = llvm::Constant::getNullValue(CopyCtorTy);
if (Ctor == nullptr) {
auto CtorTy = llvm::FunctionType::get(CGM.VoidPtrTy, CGM.VoidPtrTy,
/*isVarArg=*/false)->getPointerTo();
Ctor = llvm::Constant::getNullValue(CtorTy);
}
if (Dtor == nullptr) {
auto DtorTy = llvm::FunctionType::get(CGM.VoidTy, CGM.VoidPtrTy,
/*isVarArg=*/false)->getPointerTo();
Dtor = llvm::Constant::getNullValue(DtorTy);
}
if (!CGF) {
auto InitFunctionTy =
llvm::FunctionType::get(CGM.VoidTy, /*isVarArg*/ false);
auto InitFunction = CGM.CreateGlobalInitOrDestructFunction(
InitFunctionTy, ".__omp_threadprivate_init_.",
CGM.getTypes().arrangeNullaryFunction());
CodeGenFunction InitCGF(CGM);
FunctionArgList ArgList;
InitCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, InitFunction,
CGM.getTypes().arrangeNullaryFunction(), ArgList,
Loc);
emitThreadPrivateVarInit(InitCGF, VDAddr, Ctor, CopyCtor, Dtor, Loc);
InitCGF.FinishFunction();
return InitFunction;
}
emitThreadPrivateVarInit(*CGF, VDAddr, Ctor, CopyCtor, Dtor, Loc);
}
return nullptr;
}
/// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen
/// function. Here is the logic:
/// if (Cond) {
/// ThenGen();
/// } else {
/// ElseGen();
/// }
void CGOpenMPRuntime::emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
const RegionCodeGenTy &ThenGen,
const RegionCodeGenTy &ElseGen) {
CodeGenFunction::LexicalScope ConditionScope(CGF, Cond->getSourceRange());
// If the condition constant folds and can be elided, try to avoid emitting
// the condition and the dead arm of the if/else.
bool CondConstant;
if (CGF.ConstantFoldsToSimpleInteger(Cond, CondConstant)) {
if (CondConstant)
ThenGen(CGF);
else
ElseGen(CGF);
return;
}
// Otherwise, the condition did not fold, or we couldn't elide it. Just
// emit the conditional branch.
auto ThenBlock = CGF.createBasicBlock("omp_if.then");
auto ElseBlock = CGF.createBasicBlock("omp_if.else");
auto ContBlock = CGF.createBasicBlock("omp_if.end");
CGF.EmitBranchOnBoolExpr(Cond, ThenBlock, ElseBlock, /*TrueCount=*/0);
// Emit the 'then' code.
CGF.EmitBlock(ThenBlock);
ThenGen(CGF);
CGF.EmitBranch(ContBlock);
// Emit the 'else' code if present.
// There is no need to emit a line number for the unconditional branch.
(void)ApplyDebugLocation::CreateEmpty(CGF);
CGF.EmitBlock(ElseBlock);
ElseGen(CGF);
// There is no need to emit a line number for the unconditional branch.
(void)ApplyDebugLocation::CreateEmpty(CGF);
CGF.EmitBranch(ContBlock);
// Emit the continuation block for code after the if.
CGF.EmitBlock(ContBlock, /*IsFinished=*/true);
}
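// Codegen for '#pragma omp parallel' with an optional 'if' clause:
// if (IfCond) {
// __kmpc_fork_call(&loc, <num captured vars>, microtask, <captured vars>...);
// } else {
// __kmpc_serialized_parallel(&loc, gtid);
// OutlinedFn(&gtid, &zero, <captured vars>...);
// __kmpc_end_serialized_parallel(&loc, gtid);
// }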
void CGOpenMPRuntime::emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
const Expr *IfCond) {
if (!CGF.HaveInsertPoint())
return;
auto *RTLoc = emitUpdateLocation(CGF, Loc);
auto &&ThenGen = [OutlinedFn, CapturedVars, RTLoc](CodeGenFunction &CGF,
PrePostActionTy &) {
// Build call __kmpc_fork_call(loc, n, microtask, var1, .., varn);
auto &RT = CGF.CGM.getOpenMPRuntime();
llvm::Value *Args[] = {
RTLoc,
CGF.Builder.getInt32(CapturedVars.size()), // Number of captured vars
CGF.Builder.CreateBitCast(OutlinedFn, RT.getKmpc_MicroPointerTy())};
llvm::SmallVector<llvm::Value *, 16> RealArgs;
RealArgs.append(std::begin(Args), std::end(Args));
RealArgs.append(CapturedVars.begin(), CapturedVars.end());
auto RTLFn = RT.createRuntimeFunction(OMPRTL__kmpc_fork_call);
CGF.EmitRuntimeCall(RTLFn, RealArgs);
};
auto &&ElseGen = [OutlinedFn, CapturedVars, RTLoc, Loc](CodeGenFunction &CGF,
PrePostActionTy &) {
auto &RT = CGF.CGM.getOpenMPRuntime();
auto ThreadID = RT.getThreadID(CGF, Loc);
// Build calls:
// __kmpc_serialized_parallel(&Loc, GTid);
llvm::Value *Args[] = {RTLoc, ThreadID};
CGF.EmitRuntimeCall(
RT.createRuntimeFunction(OMPRTL__kmpc_serialized_parallel), Args);
// OutlinedFn(&GTid, &zero, CapturedStruct);
auto ThreadIDAddr = RT.emitThreadIDAddress(CGF, Loc);
Address ZeroAddr =
CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
/*Name*/ ".zero.addr");
CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
// __kmpc_end_serialized_parallel(&Loc, GTid);
llvm::Value *EndArgs[] = {RT.emitUpdateLocation(CGF, Loc), ThreadID};
CGF.EmitRuntimeCall(
RT.createRuntimeFunction(OMPRTL__kmpc_end_serialized_parallel),
EndArgs);
};
if (IfCond)
emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
else {
RegionCodeGenTy ThenRCG(ThenGen);
ThenRCG(CGF);
}
}
// If we're inside an (outlined) parallel region, use the region info's
// thread-ID variable (it is passed as the first argument of the outlined
// function, "kmp_int32 *gtid"). Otherwise, if we're not inside a parallel
// region but in regular serial code, get the thread ID by calling kmp_int32
// __kmpc_global_thread_num(ident_t *loc), stash it in a temporary, and return
// the address of that temporary.
Address CGOpenMPRuntime::emitThreadIDAddress(CodeGenFunction &CGF,
SourceLocation Loc) {
if (auto *OMPRegionInfo =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
if (OMPRegionInfo->getThreadIDVariable())
return OMPRegionInfo->getThreadIDVariableLValue(CGF).getAddress();
auto ThreadID = getThreadID(CGF, Loc);
auto Int32Ty =
CGF.getContext().getIntTypeForBitwidth(/*DestWidth*/ 32, /*Signed*/ true);
auto ThreadIDTemp = CGF.CreateMemTemp(Int32Ty, /*Name*/ ".threadid_temp.");
CGF.EmitStoreOfScalar(ThreadID,
CGF.MakeAddrLValue(ThreadIDTemp, Int32Ty));
return ThreadIDTemp;
}
llvm::Constant *
CGOpenMPRuntime::getOrCreateInternalVariable(llvm::Type *Ty,
const llvm::Twine &Name) {
SmallString<256> Buffer;
llvm::raw_svector_ostream Out(Buffer);
Out << Name;
auto RuntimeName = Out.str();
auto &Elem = *InternalVars.insert(std::make_pair(RuntimeName, nullptr)).first;
if (Elem.second) {
assert(Elem.second->getType()->getPointerElementType() == Ty &&
"OMP internal variable has different type than requested");
return &*Elem.second;
}
return Elem.second = new llvm::GlobalVariable(
CGM.getModule(), Ty, /*IsConstant*/ false,
llvm::GlobalValue::CommonLinkage, llvm::Constant::getNullValue(Ty),
Elem.first());
}
llvm::Value *CGOpenMPRuntime::getCriticalRegionLock(StringRef CriticalName) {
llvm::Twine Name(".gomp_critical_user_", CriticalName);
return getOrCreateInternalVariable(KmpCriticalNameTy, Name.concat(".var"));
}
namespace {
/// Common pre(post)-action for different OpenMP constructs.
class CommonActionTy final : public PrePostActionTy {
llvm::Value *EnterCallee;
ArrayRef<llvm::Value *> EnterArgs;
llvm::Value *ExitCallee;
ArrayRef<llvm::Value *> ExitArgs;
bool Conditional;
llvm::BasicBlock *ContBlock = nullptr;
public:
CommonActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
bool Conditional = false)
: EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
ExitArgs(ExitArgs), Conditional(Conditional) {}
void Enter(CodeGenFunction &CGF) override {
llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
if (Conditional) {
llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
ContBlock = CGF.createBasicBlock("omp_if.end");
// Generate the branch (If-stmt)
CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
CGF.EmitBlock(ThenBlock);
}
}
void Done(CodeGenFunction &CGF) {
// Emit the rest of blocks/branches
CGF.EmitBranch(ContBlock);
CGF.EmitBlock(ContBlock, true);
}
void Exit(CodeGenFunction &CGF) override {
CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
}
};
} // anonymous namespace
void CGOpenMPRuntime::emitCriticalRegion(CodeGenFunction &CGF,
StringRef CriticalName,
const RegionCodeGenTy &CriticalOpGen,
SourceLocation Loc, const Expr *Hint) {
// __kmpc_critical[_with_hint](ident_t *, gtid, Lock[, hint]);
// CriticalOpGen();
// __kmpc_end_critical(ident_t *, gtid, Lock);
// Prepare arguments and build a call to __kmpc_critical
if (!CGF.HaveInsertPoint())
return;
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
getCriticalRegionLock(CriticalName)};
llvm::SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args),
std::end(Args));
if (Hint) {
EnterArgs.push_back(CGF.Builder.CreateIntCast(
CGF.EmitScalarExpr(Hint), CGM.IntPtrTy, /*isSigned=*/false));
}
CommonActionTy Action(
createRuntimeFunction(Hint ? OMPRTL__kmpc_critical_with_hint
: OMPRTL__kmpc_critical),
EnterArgs, createRuntimeFunction(OMPRTL__kmpc_end_critical), Args);
CriticalOpGen.setAction(Action);
emitInlinedDirective(CGF, OMPD_critical, CriticalOpGen);
}
void CGOpenMPRuntime::emitMasterRegion(CodeGenFunction &CGF,
const RegionCodeGenTy &MasterOpGen,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// if(__kmpc_master(ident_t *, gtid)) {
// MasterOpGen();
// __kmpc_end_master(ident_t *, gtid);
// }
// Prepare arguments and build a call to __kmpc_master
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_master), Args,
createRuntimeFunction(OMPRTL__kmpc_end_master), Args,
/*Conditional=*/true);
MasterOpGen.setAction(Action);
emitInlinedDirective(CGF, OMPD_master, MasterOpGen);
Action.Done(CGF);
}
void CGOpenMPRuntime::emitTaskyieldCall(CodeGenFunction &CGF,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Build call __kmpc_omp_taskyield(loc, thread_id, 0);
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
llvm::ConstantInt::get(CGM.IntTy, /*V=*/0, /*isSigned=*/true)};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_taskyield), Args);
if (auto *Region = dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
Region->emitUntiedSwitch(CGF);
}
void CGOpenMPRuntime::emitTaskgroupRegion(CodeGenFunction &CGF,
const RegionCodeGenTy &TaskgroupOpGen,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// __kmpc_taskgroup(ident_t *, gtid);
// TaskgroupOpGen();
// __kmpc_end_taskgroup(ident_t *, gtid);
// Prepare arguments and build a call to __kmpc_taskgroup
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_taskgroup), Args,
createRuntimeFunction(OMPRTL__kmpc_end_taskgroup),
Args);
TaskgroupOpGen.setAction(Action);
emitInlinedDirective(CGF, OMPD_taskgroup, TaskgroupOpGen);
}
/// Given an array of pointers to variables, project the address of a
/// given variable.
static Address emitAddrOfVarFromArray(CodeGenFunction &CGF, Address Array,
unsigned Index, const VarDecl *Var) {
// Pull out the pointer to the variable.
Address PtrAddr =
CGF.Builder.CreateConstArrayGEP(Array, Index, CGF.getPointerSize());
llvm::Value *Ptr = CGF.Builder.CreateLoad(PtrAddr);
Address Addr = Address(Ptr, CGF.getContext().getDeclAlign(Var));
Addr = CGF.Builder.CreateElementBitCast(
Addr, CGF.ConvertTypeForMem(Var->getType()));
return Addr;
}
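// Synthesize the cpy_func helper passed to __kmpc_copyprivate: both arguments
// are void*[n] arrays of variable addresses, and element I is copied with the
// I-th copyprivate assignment expression.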
static llvm::Value *emitCopyprivateCopyFunction(
CodeGenModule &CGM, llvm::Type *ArgsType,
ArrayRef<const Expr *> CopyprivateVars, ArrayRef<const Expr *> DestExprs,
ArrayRef<const Expr *> SrcExprs, ArrayRef<const Expr *> AssignmentOps) {
auto &C = CGM.getContext();
// void copy_func(void *LHSArg, void *RHSArg);
FunctionArgList Args;
ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
C.VoidPtrTy);
ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
C.VoidPtrTy);
Args.push_back(&LHSArg);
Args.push_back(&RHSArg);
auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
".omp.copyprivate.copy_func", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
// Dest = (void*[n])(LHSArg);
// Src = (void*[n])(RHSArg);
Address LHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)),
ArgsType), CGF.getPointerAlign());
Address RHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)),
ArgsType), CGF.getPointerAlign());
// *(Type0*)Dst[0] = *(Type0*)Src[0];
// *(Type1*)Dst[1] = *(Type1*)Src[1];
// ...
// *(Typen*)Dst[n] = *(Typen*)Src[n];
for (unsigned I = 0, E = AssignmentOps.size(); I < E; ++I) {
auto DestVar = cast<VarDecl>(cast<DeclRefExpr>(DestExprs[I])->getDecl());
Address DestAddr = emitAddrOfVarFromArray(CGF, LHS, I, DestVar);
auto SrcVar = cast<VarDecl>(cast<DeclRefExpr>(SrcExprs[I])->getDecl());
Address SrcAddr = emitAddrOfVarFromArray(CGF, RHS, I, SrcVar);
auto *VD = cast<DeclRefExpr>(CopyprivateVars[I])->getDecl();
QualType Type = VD->getType();
CGF.EmitOMPCopy(Type, DestAddr, SrcAddr, DestVar, SrcVar, AssignmentOps[I]);
}
CGF.FinishFunction();
return Fn;
}
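/// Illustrative example: for
/// \code
///   #pragma omp single copyprivate(x)
///   { x = init(); }
/// \endcode
/// the helper below emits the conditional __kmpc_single/__kmpc_end_single
/// pattern and, because copyprivate is present, a trailing __kmpc_copyprivate
/// call that broadcasts 'x' through the generated copy function.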
void CGOpenMPRuntime::emitSingleRegion(CodeGenFunction &CGF,
const RegionCodeGenTy &SingleOpGen,
SourceLocation Loc,
ArrayRef<const Expr *> CopyprivateVars,
ArrayRef<const Expr *> SrcExprs,
ArrayRef<const Expr *> DstExprs,
ArrayRef<const Expr *> AssignmentOps) {
if (!CGF.HaveInsertPoint())
return;
assert(CopyprivateVars.size() == SrcExprs.size() &&
CopyprivateVars.size() == DstExprs.size() &&
CopyprivateVars.size() == AssignmentOps.size());
auto &C = CGM.getContext();
// int32 did_it = 0;
// if(__kmpc_single(ident_t *, gtid)) {
// SingleOpGen();
// __kmpc_end_single(ident_t *, gtid);
// did_it = 1;
// }
// call __kmpc_copyprivate(ident_t *, gtid, <buf_size>, <copyprivate list>,
// <copy_func>, did_it);
Address DidIt = Address::invalid();
if (!CopyprivateVars.empty()) {
// int32 did_it = 0;
auto KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1);
DidIt = CGF.CreateMemTemp(KmpInt32Ty, ".omp.copyprivate.did_it");
CGF.Builder.CreateStore(CGF.Builder.getInt32(0), DidIt);
}
// Prepare arguments and build a call to __kmpc_single
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_single), Args,
createRuntimeFunction(OMPRTL__kmpc_end_single), Args,
/*Conditional=*/true);
SingleOpGen.setAction(Action);
emitInlinedDirective(CGF, OMPD_single, SingleOpGen);
if (DidIt.isValid()) {
// did_it = 1;
CGF.Builder.CreateStore(CGF.Builder.getInt32(1), DidIt);
}
Action.Done(CGF);
// call __kmpc_copyprivate(ident_t *, gtid, <buf_size>, <copyprivate list>,
// <copy_func>, did_it);
if (DidIt.isValid()) {
llvm::APInt ArraySize(/*unsigned int numBits=*/32, CopyprivateVars.size());
auto CopyprivateArrayTy =
C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
/*IndexTypeQuals=*/0);
// Create a list of all private variables for copyprivate.
Address CopyprivateList =
CGF.CreateMemTemp(CopyprivateArrayTy, ".omp.copyprivate.cpr_list");
for (unsigned I = 0, E = CopyprivateVars.size(); I < E; ++I) {
Address Elem = CGF.Builder.CreateConstArrayGEP(
CopyprivateList, I, CGF.getPointerSize());
CGF.Builder.CreateStore(
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLValue(CopyprivateVars[I]).getPointer(), CGF.VoidPtrTy),
Elem);
}
// Build function that copies private values from single region to all other
// threads in the corresponding parallel region.
auto *CpyFn = emitCopyprivateCopyFunction(
CGM, CGF.ConvertTypeForMem(CopyprivateArrayTy)->getPointerTo(),
CopyprivateVars, SrcExprs, DstExprs, AssignmentOps);
auto *BufSize = CGF.getTypeSize(CopyprivateArrayTy);
Address CL =
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(CopyprivateList,
CGF.VoidPtrTy);
auto *DidItVal = CGF.Builder.CreateLoad(DidIt);
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), // ident_t *<loc>
getThreadID(CGF, Loc), // i32 <gtid>
BufSize, // size_t <buf_size>
CL.getPointer(), // void *<copyprivate list>
CpyFn, // void (*) (void *, void *) <copy_func>
DidItVal // i32 did_it
};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_copyprivate), Args);
}
}
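/// Illustrative example: '#pragma omp ordered' (the threads form) is wrapped
/// in __kmpc_ordered/__kmpc_end_ordered; when IsThreads is false (e.g. the
/// simd form) the region is simply emitted inline without runtime calls.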
void CGOpenMPRuntime::emitOrderedRegion(CodeGenFunction &CGF,
const RegionCodeGenTy &OrderedOpGen,
SourceLocation Loc, bool IsThreads) {
if (!CGF.HaveInsertPoint())
return;
// __kmpc_ordered(ident_t *, gtid);
// OrderedOpGen();
// __kmpc_end_ordered(ident_t *, gtid);
// Prepare arguments and build a call to __kmpc_ordered
if (IsThreads) {
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_ordered), Args,
createRuntimeFunction(OMPRTL__kmpc_end_ordered),
Args);
OrderedOpGen.setAction(Action);
emitInlinedDirective(CGF, OMPD_ordered, OrderedOpGen);
return;
}
emitInlinedDirective(CGF, OMPD_ordered, OrderedOpGen);
}
void CGOpenMPRuntime::emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc,
OpenMPDirectiveKind Kind, bool EmitChecks,
bool ForceSimpleCall) {
if (!CGF.HaveInsertPoint())
return;
// Build call __kmpc_cancel_barrier(loc, thread_id);
// Build call __kmpc_barrier(loc, thread_id);
unsigned Flags;
if (Kind == OMPD_for)
Flags = OMP_IDENT_BARRIER_IMPL_FOR;
else if (Kind == OMPD_sections)
Flags = OMP_IDENT_BARRIER_IMPL_SECTIONS;
else if (Kind == OMPD_single)
Flags = OMP_IDENT_BARRIER_IMPL_SINGLE;
else if (Kind == OMPD_barrier)
Flags = OMP_IDENT_BARRIER_EXPL;
else
Flags = OMP_IDENT_BARRIER_IMPL;
// Build call __kmpc_cancel_barrier(loc, thread_id) or __kmpc_barrier(loc,
// thread_id);
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
getThreadID(CGF, Loc)};
if (auto *OMPRegionInfo =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
if (!ForceSimpleCall && OMPRegionInfo->hasCancel()) {
auto *Result = CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_cancel_barrier), Args);
if (EmitChecks) {
// if (__kmpc_cancel_barrier()) {
// exit from construct;
// }
auto *ExitBB = CGF.createBasicBlock(".cancel.exit");
auto *ContBB = CGF.createBasicBlock(".cancel.continue");
auto *Cmp = CGF.Builder.CreateIsNotNull(Result);
CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB);
CGF.EmitBlock(ExitBB);
// exit from construct;
auto CancelDestination =
CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind());
CGF.EmitBranchThroughCleanup(CancelDestination);
CGF.EmitBlock(ContBB, /*IsFinished=*/true);
}
return;
}
}
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_barrier), Args);
}
/// \brief Map the OpenMP loop schedule to the runtime enumeration.
static OpenMPSchedType getRuntimeSchedule(OpenMPScheduleClauseKind ScheduleKind,
bool Chunked, bool Ordered) {
switch (ScheduleKind) {
case OMPC_SCHEDULE_static:
return Chunked ? (Ordered ? OMP_ord_static_chunked : OMP_sch_static_chunked)
: (Ordered ? OMP_ord_static : OMP_sch_static);
case OMPC_SCHEDULE_dynamic:
return Ordered ? OMP_ord_dynamic_chunked : OMP_sch_dynamic_chunked;
case OMPC_SCHEDULE_guided:
return Ordered ? OMP_ord_guided_chunked : OMP_sch_guided_chunked;
case OMPC_SCHEDULE_runtime:
return Ordered ? OMP_ord_runtime : OMP_sch_runtime;
case OMPC_SCHEDULE_auto:
return Ordered ? OMP_ord_auto : OMP_sch_auto;
case OMPC_SCHEDULE_unknown:
assert(!Chunked && "chunk was specified but schedule kind not known");
return Ordered ? OMP_ord_static : OMP_sch_static;
}
llvm_unreachable("Unexpected runtime schedule");
}
/// \brief Map the OpenMP distribute schedule to the runtime enumeration.
static OpenMPSchedType
getRuntimeSchedule(OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) {
// Only static is allowed for dist_schedule.
return Chunked ? OMP_dist_sch_static_chunked : OMP_dist_sch_static;
}
bool CGOpenMPRuntime::isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind,
bool Chunked) const {
auto Schedule = getRuntimeSchedule(ScheduleKind, Chunked, /*Ordered=*/false);
return Schedule == OMP_sch_static;
}
bool CGOpenMPRuntime::isStaticNonchunked(
OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) const {
auto Schedule = getRuntimeSchedule(ScheduleKind, Chunked);
return Schedule == OMP_dist_sch_static;
}
bool CGOpenMPRuntime::isDynamic(OpenMPScheduleClauseKind ScheduleKind) const {
auto Schedule =
getRuntimeSchedule(ScheduleKind, /*Chunked=*/false, /*Ordered=*/false);
assert(Schedule != OMP_sch_static_chunked && "cannot be chunked here");
return Schedule != OMP_sch_static;
}
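/// Illustrative example: schedule(nonmonotonic: dynamic, 4) maps to
/// OMP_sch_dynamic_chunked | OMP_sch_modifier_nonmonotonic, i.e. the modifier
/// bits are OR-ed into the schedule value returned by the function below.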
static int addMonoNonMonoModifier(OpenMPSchedType Schedule,
OpenMPScheduleClauseModifier M1,
OpenMPScheduleClauseModifier M2) {
int Modifier = 0;
switch (M1) {
case OMPC_SCHEDULE_MODIFIER_monotonic:
Modifier = OMP_sch_modifier_monotonic;
break;
case OMPC_SCHEDULE_MODIFIER_nonmonotonic:
Modifier = OMP_sch_modifier_nonmonotonic;
break;
case OMPC_SCHEDULE_MODIFIER_simd:
if (Schedule == OMP_sch_static_chunked)
Schedule = OMP_sch_static_balanced_chunked;
break;
case OMPC_SCHEDULE_MODIFIER_last:
case OMPC_SCHEDULE_MODIFIER_unknown:
break;
}
switch (M2) {
case OMPC_SCHEDULE_MODIFIER_monotonic:
Modifier = OMP_sch_modifier_monotonic;
break;
case OMPC_SCHEDULE_MODIFIER_nonmonotonic:
Modifier = OMP_sch_modifier_nonmonotonic;
break;
case OMPC_SCHEDULE_MODIFIER_simd:
if (Schedule == OMP_sch_static_chunked)
Schedule = OMP_sch_static_balanced_chunked;
break;
case OMPC_SCHEDULE_MODIFIER_last:
case OMPC_SCHEDULE_MODIFIER_unknown:
break;
}
return Schedule | Modifier;
}
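/// Illustrative example: a loop with schedule(dynamic, 4) and a 32-bit signed
/// induction variable is initialized through the 32-bit signed dispatch entry
/// point (__kmpc_dispatch_init_4), with the chunk value 4 as the last argument.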
void CGOpenMPRuntime::emitForDispatchInit(CodeGenFunction &CGF,
SourceLocation Loc,
const OpenMPScheduleTy &ScheduleKind,
unsigned IVSize, bool IVSigned,
bool Ordered, llvm::Value *UB,
llvm::Value *Chunk) {
if (!CGF.HaveInsertPoint())
return;
OpenMPSchedType Schedule =
getRuntimeSchedule(ScheduleKind.Schedule, Chunk != nullptr, Ordered);
assert(Ordered ||
(Schedule != OMP_sch_static && Schedule != OMP_sch_static_chunked &&
Schedule != OMP_ord_static && Schedule != OMP_ord_static_chunked &&
Schedule != OMP_sch_static_balanced_chunked));
// Call __kmpc_dispatch_init(
// ident_t *loc, kmp_int32 tid, kmp_int32 schedule,
// kmp_int[32|64] lower, kmp_int[32|64] upper,
// kmp_int[32|64] stride, kmp_int[32|64] chunk);
// If the Chunk was not specified in the clause, use the default value 1.
if (Chunk == nullptr)
Chunk = CGF.Builder.getIntN(IVSize, 1);
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
CGF.Builder.getInt32(addMonoNonMonoModifier(
Schedule, ScheduleKind.M1, ScheduleKind.M2)), // Schedule type
CGF.Builder.getIntN(IVSize, 0), // Lower
UB, // Upper
CGF.Builder.getIntN(IVSize, 1), // Stride
Chunk // Chunk
};
CGF.EmitRuntimeCall(createDispatchInitFunction(IVSize, IVSigned), Args);
}
static void emitForStaticInitCall(
CodeGenFunction &CGF, llvm::Value *UpdateLocation, llvm::Value *ThreadId,
llvm::Constant *ForStaticInitFunction, OpenMPSchedType Schedule,
OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2,
unsigned IVSize, bool Ordered, Address IL, Address LB, Address UB,
Address ST, llvm::Value *Chunk) {
if (!CGF.HaveInsertPoint())
return;
assert(!Ordered);
assert(Schedule == OMP_sch_static || Schedule == OMP_sch_static_chunked ||
Schedule == OMP_sch_static_balanced_chunked ||
Schedule == OMP_ord_static || Schedule == OMP_ord_static_chunked ||
Schedule == OMP_dist_sch_static ||
Schedule == OMP_dist_sch_static_chunked);
// Call __kmpc_for_static_init(
// ident_t *loc, kmp_int32 tid, kmp_int32 schedtype,
// kmp_int32 *p_lastiter, kmp_int[32|64] *p_lower,
// kmp_int[32|64] *p_upper, kmp_int[32|64] *p_stride,
// kmp_int[32|64] incr, kmp_int[32|64] chunk);
if (Chunk == nullptr) {
assert((Schedule == OMP_sch_static || Schedule == OMP_ord_static ||
Schedule == OMP_dist_sch_static) &&
"expected static non-chunked schedule");
// If the Chunk was not specified in the clause, use the default value 1.
Chunk = CGF.Builder.getIntN(IVSize, 1);
} else {
assert((Schedule == OMP_sch_static_chunked ||
Schedule == OMP_sch_static_balanced_chunked ||
Schedule == OMP_ord_static_chunked ||
Schedule == OMP_dist_sch_static_chunked) &&
"expected static chunked schedule");
}
llvm::Value *Args[] = {
UpdateLocation, ThreadId, CGF.Builder.getInt32(addMonoNonMonoModifier(
Schedule, M1, M2)), // Schedule type
IL.getPointer(), // &isLastIter
LB.getPointer(), // &LB
UB.getPointer(), // &UB
ST.getPointer(), // &Stride
CGF.Builder.getIntN(IVSize, 1), // Incr
Chunk // Chunk
};
CGF.EmitRuntimeCall(ForStaticInitFunction, Args);
}
void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF,
SourceLocation Loc,
const OpenMPScheduleTy &ScheduleKind,
unsigned IVSize, bool IVSigned,
bool Ordered, Address IL, Address LB,
Address UB, Address ST,
llvm::Value *Chunk) {
OpenMPSchedType ScheduleNum =
getRuntimeSchedule(ScheduleKind.Schedule, Chunk != nullptr, Ordered);
auto *UpdatedLocation = emitUpdateLocation(CGF, Loc);
auto *ThreadId = getThreadID(CGF, Loc);
auto *StaticInitFunction = createForStaticInitFunction(IVSize, IVSigned);
emitForStaticInitCall(CGF, UpdatedLocation, ThreadId, StaticInitFunction,
ScheduleNum, ScheduleKind.M1, ScheduleKind.M2, IVSize,
Ordered, IL, LB, UB, ST, Chunk);
}
void CGOpenMPRuntime::emitDistributeStaticInit(
CodeGenFunction &CGF, SourceLocation Loc,
OpenMPDistScheduleClauseKind SchedKind, unsigned IVSize, bool IVSigned,
bool Ordered, Address IL, Address LB, Address UB, Address ST,
llvm::Value *Chunk) {
OpenMPSchedType ScheduleNum = getRuntimeSchedule(SchedKind, Chunk != nullptr);
auto *UpdatedLocation = emitUpdateLocation(CGF, Loc);
auto *ThreadId = getThreadID(CGF, Loc);
auto *StaticInitFunction = createForStaticInitFunction(IVSize, IVSigned);
emitForStaticInitCall(CGF, UpdatedLocation, ThreadId, StaticInitFunction,
ScheduleNum, OMPC_SCHEDULE_MODIFIER_unknown,
OMPC_SCHEDULE_MODIFIER_unknown, IVSize, Ordered, IL, LB,
UB, ST, Chunk);
}
void CGOpenMPRuntime::emitForStaticFinish(CodeGenFunction &CGF,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Call __kmpc_for_static_fini(ident_t *loc, kmp_int32 tid);
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_for_static_fini),
Args);
}
void CGOpenMPRuntime::emitForOrderedIterationEnd(CodeGenFunction &CGF,
SourceLocation Loc,
unsigned IVSize,
bool IVSigned) {
if (!CGF.HaveInsertPoint())
return;
// Call __kmpc_for_dynamic_fini_(4|8)[u](ident_t *loc, kmp_int32 tid);
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
CGF.EmitRuntimeCall(createDispatchFiniFunction(IVSize, IVSigned), Args);
}
llvm::Value *CGOpenMPRuntime::emitForNext(CodeGenFunction &CGF,
SourceLocation Loc, unsigned IVSize,
bool IVSigned, Address IL,
Address LB, Address UB,
Address ST) {
// Call __kmpc_dispatch_next(
// ident_t *loc, kmp_int32 tid, kmp_int32 *p_lastiter,
// kmp_int[32|64] *p_lower, kmp_int[32|64] *p_upper,
// kmp_int[32|64] *p_stride);
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc),
getThreadID(CGF, Loc),
IL.getPointer(), // &isLastIter
LB.getPointer(), // &Lower
UB.getPointer(), // &Upper
ST.getPointer() // &Stride
};
llvm::Value *Call =
CGF.EmitRuntimeCall(createDispatchNextFunction(IVSize, IVSigned), Args);
return CGF.EmitScalarConversion(
Call, CGF.getContext().getIntTypeForBitwidth(32, /* Signed */ true),
CGF.getContext().BoolTy, Loc);
}
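/// Illustrative example: 'num_threads(N)' on a parallel directive becomes a
/// __kmpc_push_num_threads(&loc, global_tid, N) call emitted by the helper
/// below before the parallel region is forked.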
void CGOpenMPRuntime::emitNumThreadsClause(CodeGenFunction &CGF,
llvm::Value *NumThreads,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Build call __kmpc_push_num_threads(&loc, global_tid, num_threads)
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
CGF.Builder.CreateIntCast(NumThreads, CGF.Int32Ty, /*isSigned*/ true)};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_threads),
Args);
}
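/// Illustrative example: 'proc_bind(spread)' is lowered to
/// __kmpc_push_proc_bind(&loc, global_tid, ProcBindSpread), using the enum
/// defined in the helper below.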
void CGOpenMPRuntime::emitProcBindClause(CodeGenFunction &CGF,
OpenMPProcBindClauseKind ProcBind,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Constants for proc bind value accepted by the runtime.
enum ProcBindTy {
ProcBindFalse = 0,
ProcBindTrue,
ProcBindMaster,
ProcBindClose,
ProcBindSpread,
ProcBindIntel,
ProcBindDefault
} RuntimeProcBind;
switch (ProcBind) {
case OMPC_PROC_BIND_master:
RuntimeProcBind = ProcBindMaster;
break;
case OMPC_PROC_BIND_close:
RuntimeProcBind = ProcBindClose;
break;
case OMPC_PROC_BIND_spread:
RuntimeProcBind = ProcBindSpread;
break;
case OMPC_PROC_BIND_unknown:
llvm_unreachable("Unsupported proc_bind value.");
}
// Build call __kmpc_push_proc_bind(&loc, global_tid, proc_bind)
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
llvm::ConstantInt::get(CGM.IntTy, RuntimeProcBind, /*isSigned=*/true)};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_proc_bind), Args);
}
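/// Illustrative example: '#pragma omp flush' becomes a single __kmpc_flush(loc)
/// call; the optional variable list is ignored by this helper.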
void CGOpenMPRuntime::emitFlush(CodeGenFunction &CGF, ArrayRef<const Expr *>,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Build call void __kmpc_flush(ident_t *loc)
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_flush),
emitUpdateLocation(CGF, Loc));
}
namespace {
/// \brief Indexes of fields for type kmp_task_t.
enum KmpTaskTFields {
/// \brief List of shared variables.
KmpTaskTShareds,
/// \brief Task routine.
KmpTaskTRoutine,
/// \brief Partition id for the untied tasks.
KmpTaskTPartId,
/// Function with call of destructors for private variables.
Data1,
/// Task priority.
Data2,
/// (Taskloops only) Lower bound.
KmpTaskTLowerBound,
/// (Taskloops only) Upper bound.
KmpTaskTUpperBound,
/// (Taskloops only) Stride.
KmpTaskTStride,
/// (Taskloops only) Is last iteration flag.
KmpTaskTLastIter,
};
} // anonymous namespace
bool CGOpenMPRuntime::OffloadEntriesInfoManagerTy::empty() const {
// FIXME: Add other entry types when they become supported.
return OffloadEntriesTargetRegion.empty();
}
/// \brief Initialize target region entry.
void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::
initializeTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID,
StringRef ParentName, unsigned LineNum,
unsigned Order) {
assert(CGM.getLangOpts().OpenMPIsDevice && "Initialization of entries is "
"only required for the device "
"code generation.");
OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] =
OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
/*Flags=*/0);
++OffloadingEntriesNum;
}
void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::
registerTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID,
StringRef ParentName, unsigned LineNum,
llvm::Constant *Addr, llvm::Constant *ID,
int32_t Flags) {
// If we are emitting code for a target, the entry is already initialized;
// it only has to be registered.
if (CGM.getLangOpts().OpenMPIsDevice) {
assert(hasTargetRegionEntryInfo(DeviceID, FileID, ParentName, LineNum) &&
"Entry must exist.");
auto &Entry =
OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum];
assert(Entry.isValid() && "Entry not initialized!");
Entry.setAddress(Addr);
Entry.setID(ID);
Entry.setFlags(Flags);
return;
} else {
OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum++, Addr, ID, Flags);
OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] = Entry;
}
}
bool CGOpenMPRuntime::OffloadEntriesInfoManagerTy::hasTargetRegionEntryInfo(
unsigned DeviceID, unsigned FileID, StringRef ParentName,
unsigned LineNum) const {
auto PerDevice = OffloadEntriesTargetRegion.find(DeviceID);
if (PerDevice == OffloadEntriesTargetRegion.end())
return false;
auto PerFile = PerDevice->second.find(FileID);
if (PerFile == PerDevice->second.end())
return false;
auto PerParentName = PerFile->second.find(ParentName);
if (PerParentName == PerFile->second.end())
return false;
auto PerLine = PerParentName->second.find(LineNum);
if (PerLine == PerParentName->second.end())
return false;
// Fail if this entry is already registered.
if (PerLine->second.getAddress() || PerLine->second.getID())
return false;
return true;
}
void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::actOnTargetRegionEntriesInfo(
const OffloadTargetRegionEntryInfoActTy &Action) {
// Scan all target region entries and perform the provided action.
for (auto &D : OffloadEntriesTargetRegion)
for (auto &F : D.second)
for (auto &P : F.second)
for (auto &L : P.second)
Action(D.first, F.first, P.first(), L.first, L.second);
}
/// \brief Create a Ctor/Dtor-like function whose body is emitted through
/// \a Codegen. This is used to emit the two functions that register and
/// unregister the descriptor of the current compilation unit.
static llvm::Function *
createOffloadingBinaryDescriptorFunction(CodeGenModule &CGM, StringRef Name,
const RegionCodeGenTy &Codegen) {
auto &C = CGM.getContext();
FunctionArgList Args;
ImplicitParamDecl DummyPtr(C, /*DC=*/nullptr, SourceLocation(),
/*Id=*/nullptr, C.VoidPtrTy);
Args.push_back(&DummyPtr);
CodeGenFunction CGF(CGM);
auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto FTy = CGM.getTypes().GetFunctionType(FI);
auto *Fn =
CGM.CreateGlobalInitOrDestructFunction(FTy, Name, FI, SourceLocation());
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FI, Args, SourceLocation());
Codegen(CGF);
CGF.FinishFunction();
return Fn;
}
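/// \brief Create the host-side registration function for the offloading binary
/// descriptor. Illustrative result: a .omp_offloading.descriptor_reg function
/// that calls __tgt_register_lib(&.omp_offloading.descriptor) and registers
/// .omp_offloading.descriptor_unreg (which calls __tgt_unregister_lib) as the
/// matching destructor.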
llvm::Function *
CGOpenMPRuntime::createOffloadingBinaryDescriptorRegistration() {
// If we don't have entries or if we are emitting code for the device, we
// don't need to do anything.
if (CGM.getLangOpts().OpenMPIsDevice || OffloadEntriesInfoManager.empty())
return nullptr;
auto &M = CGM.getModule();
auto &C = CGM.getContext();
// Get list of devices we care about
auto &Devices = CGM.getLangOpts().OMPTargetTriples;
// We should be creating an offloading descriptor only if there are devices
// specified.
assert(!Devices.empty() && "No OpenMP offloading devices??");
// Create the external variables that will point to the begin and end of the
// host entries section. These will be defined by the linker.
auto *OffloadEntryTy =
CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy());
llvm::GlobalVariable *HostEntriesBegin = new llvm::GlobalVariable(
M, OffloadEntryTy, /*isConstant=*/true,
llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
".omp_offloading.entries_begin");
llvm::GlobalVariable *HostEntriesEnd = new llvm::GlobalVariable(
M, OffloadEntryTy, /*isConstant=*/true,
llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
".omp_offloading.entries_end");
// Create all device images
auto *DeviceImageTy = cast<llvm::StructType>(
CGM.getTypes().ConvertTypeForMem(getTgtDeviceImageQTy()));
ConstantInitBuilder DeviceImagesBuilder(CGM);
auto DeviceImagesEntries = DeviceImagesBuilder.beginArray(DeviceImageTy);
for (unsigned i = 0; i < Devices.size(); ++i) {
StringRef T = Devices[i].getTriple();
auto *ImgBegin = new llvm::GlobalVariable(
M, CGM.Int8Ty, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
/*Initializer=*/nullptr,
Twine(".omp_offloading.img_start.") + Twine(T));
auto *ImgEnd = new llvm::GlobalVariable(
M, CGM.Int8Ty, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
/*Initializer=*/nullptr, Twine(".omp_offloading.img_end.") + Twine(T));
auto Dev = DeviceImagesEntries.beginStruct(DeviceImageTy);
Dev.add(ImgBegin);
Dev.add(ImgEnd);
Dev.add(HostEntriesBegin);
Dev.add(HostEntriesEnd);
Dev.finishAndAddTo(DeviceImagesEntries);
}
// Create device images global array.
llvm::GlobalVariable *DeviceImages =
DeviceImagesEntries.finishAndCreateGlobal(".omp_offloading.device_images",
CGM.getPointerAlign(),
/*isConstant=*/true);
DeviceImages->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
// A pair of zero indices used to build the constant GEP expression below.
llvm::Constant *Index[] = {llvm::Constant::getNullValue(CGM.Int32Ty),
llvm::Constant::getNullValue(CGM.Int32Ty)};
// Create the target region descriptor.
auto *BinaryDescriptorTy = cast<llvm::StructType>(
CGM.getTypes().ConvertTypeForMem(getTgtBinaryDescriptorQTy()));
ConstantInitBuilder DescBuilder(CGM);
auto DescInit = DescBuilder.beginStruct(BinaryDescriptorTy);
DescInit.addInt(CGM.Int32Ty, Devices.size());
DescInit.add(llvm::ConstantExpr::getGetElementPtr(DeviceImages->getValueType(),
DeviceImages,
Index));
DescInit.add(HostEntriesBegin);
DescInit.add(HostEntriesEnd);
auto *Desc = DescInit.finishAndCreateGlobal(".omp_offloading.descriptor",
CGM.getPointerAlign(),
/*isConstant=*/true);
// Emit code to register or unregister the descriptor at program
// startup or shutdown, respectively.
// Create a variable to drive the registration and unregistration of the
// descriptor, so we can reuse the logic that emits Ctors and Dtors.
auto *IdentInfo = &C.Idents.get(".omp_offloading.reg_unreg_var");
ImplicitParamDecl RegUnregVar(C, C.getTranslationUnitDecl(), SourceLocation(),
IdentInfo, C.CharTy);
auto *UnRegFn = createOffloadingBinaryDescriptorFunction(
CGM, ".omp_offloading.descriptor_unreg",
[&](CodeGenFunction &CGF, PrePostActionTy &) {
CGF.EmitCallOrInvoke(createRuntimeFunction(OMPRTL__tgt_unregister_lib),
Desc);
});
auto *RegFn = createOffloadingBinaryDescriptorFunction(
CGM, ".omp_offloading.descriptor_reg",
[&](CodeGenFunction &CGF, PrePostActionTy &) {
CGF.EmitCallOrInvoke(createRuntimeFunction(OMPRTL__tgt_register_lib),
Desc);
CGM.getCXXABI().registerGlobalDtor(CGF, RegUnregVar, UnRegFn, Desc);
});
return RegFn;
}
void CGOpenMPRuntime::createOffloadEntry(llvm::Constant *ID,
llvm::Constant *Addr, uint64_t Size,
int32_t Flags) {
StringRef Name = Addr->getName();
auto *TgtOffloadEntryType = cast<llvm::StructType>(
CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy()));
llvm::LLVMContext &C = CGM.getModule().getContext();
llvm::Module &M = CGM.getModule();
// Make sure the address has the right type.
llvm::Constant *AddrPtr = llvm::ConstantExpr::getBitCast(ID, CGM.VoidPtrTy);
// Create constant string with the name.
llvm::Constant *StrPtrInit = llvm::ConstantDataArray::getString(C, Name);
llvm::GlobalVariable *Str =
new llvm::GlobalVariable(M, StrPtrInit->getType(), /*isConstant=*/true,
llvm::GlobalValue::InternalLinkage, StrPtrInit,
".omp_offloading.entry_name");
Str->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
llvm::Constant *StrPtr = llvm::ConstantExpr::getBitCast(Str, CGM.Int8PtrTy);
// We can't have any padding between symbols, so we need to have 1-byte
// alignment.
auto Align = CharUnits::fromQuantity(1);
// Create the entry struct.
ConstantInitBuilder EntryBuilder(CGM);
auto EntryInit = EntryBuilder.beginStruct(TgtOffloadEntryType);
EntryInit.add(AddrPtr);
EntryInit.add(StrPtr);
EntryInit.addInt(CGM.SizeTy, Size);
EntryInit.addInt(CGM.Int32Ty, Flags);
EntryInit.addInt(CGM.Int32Ty, 0);
llvm::GlobalVariable *Entry =
EntryInit.finishAndCreateGlobal(".omp_offloading.entry",
Align,
/*constant*/ true,
llvm::GlobalValue::ExternalLinkage);
// The entry has to be created in the section where the linker expects to
// find it.
Entry->setSection(".omp_offloading.entries");
}
void CGOpenMPRuntime::createOffloadEntriesAndInfoMetadata() {
// Emit the offloading entries and metadata so that the device codegen side
// can easily figure out what to emit. The produced metadata looks like
// this:
//
// !omp_offload.info = !{!1, ...}
//
// Right now we only generate metadata for functions that contain target
// regions.
// If we do not have entries, we don't need to do anything.
if (OffloadEntriesInfoManager.empty())
return;
llvm::Module &M = CGM.getModule();
llvm::LLVMContext &C = M.getContext();
SmallVector<OffloadEntriesInfoManagerTy::OffloadEntryInfo *, 16>
OrderedEntries(OffloadEntriesInfoManager.size());
// Create the offloading info metadata node.
llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
// Auxiliary helpers to create metadata values and strings.
auto getMDInt = [&](unsigned v) {
return llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(C), v));
};
auto getMDString = [&](StringRef v) { return llvm::MDString::get(C, v); };
// Create a function that emits metadata for each target region entry.
auto &&TargetRegionMetadataEmitter = [&](
unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned Line,
OffloadEntriesInfoManagerTy::OffloadEntryInfoTargetRegion &E) {
llvm::SmallVector<llvm::Metadata *, 32> Ops;
// Generate metadata for target regions. Each entry of this metadata
// contains:
// - Entry 0 -> Kind of this type of metadata (0).
// - Entry 1 -> Device ID of the file where the entry was identified.
// - Entry 2 -> File ID of the file where the entry was identified.
// - Entry 3 -> Mangled name of the function where the entry was identified.
// - Entry 4 -> Line in the file where the entry was identified.
// - Entry 5 -> Order the entry was created.
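// Illustrative operand following the layout above:
//   !{i32 0, i32 <DeviceID>, i32 <FileID>, !"<ParentName>", i32 <Line>,
//     i32 <Order>}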
// The first element of the metadata node is the kind.
Ops.push_back(getMDInt(E.getKind()));
Ops.push_back(getMDInt(DeviceID));
Ops.push_back(getMDInt(FileID));
Ops.push_back(getMDString(ParentName));
Ops.push_back(getMDInt(Line));
Ops.push_back(getMDInt(E.getOrder()));
// Save this entry in the right position of the ordered entries array.
OrderedEntries[E.getOrder()] = &E;
// Add metadata to the named metadata node.
MD->addOperand(llvm::MDNode::get(C, Ops));
};
OffloadEntriesInfoManager.actOnTargetRegionEntriesInfo(
TargetRegionMetadataEmitter);
for (auto *E : OrderedEntries) {
assert(E && "All ordered entries must exist!");
if (auto *CE =
dyn_cast<OffloadEntriesInfoManagerTy::OffloadEntryInfoTargetRegion>(
E)) {
assert(CE->getID() && CE->getAddress() &&
"Entry ID and Addr are invalid!");
createOffloadEntry(CE->getID(), CE->getAddress(), /*Size=*/0);
} else
llvm_unreachable("Unsupported entry kind.");
}
}
/// \brief Loads all the offload entries information from the host IR
/// metadata.
void CGOpenMPRuntime::loadOffloadInfoMetadata() {
// If we are in target mode, load the metadata from the host IR. This code has
// to match the metadata creation in createOffloadEntriesAndInfoMetadata().
if (!CGM.getLangOpts().OpenMPIsDevice)
return;
if (CGM.getLangOpts().OMPHostIRFile.empty())
return;
auto Buf = llvm::MemoryBuffer::getFile(CGM.getLangOpts().OMPHostIRFile);
if (Buf.getError())
return;
llvm::LLVMContext C;
auto ME = expectedToErrorOrAndEmitErrors(
C, llvm::parseBitcodeFile(Buf.get()->getMemBufferRef(), C));
if (ME.getError())
return;
llvm::NamedMDNode *MD = ME.get()->getNamedMetadata("omp_offload.info");
if (!MD)
return;
for (auto I : MD->operands()) {
llvm::MDNode *MN = cast<llvm::MDNode>(I);
auto getMDInt = [&](unsigned Idx) {
llvm::ConstantAsMetadata *V =
cast<llvm::ConstantAsMetadata>(MN->getOperand(Idx));
return cast<llvm::ConstantInt>(V->getValue())->getZExtValue();
};
auto getMDString = [&](unsigned Idx) {
llvm::MDString *V = cast<llvm::MDString>(MN->getOperand(Idx));
return V->getString();
};
switch (getMDInt(0)) {
default:
llvm_unreachable("Unexpected metadata!");
break;
case OffloadEntriesInfoManagerTy::OffloadEntryInfo::
OFFLOAD_ENTRY_INFO_TARGET_REGION:
OffloadEntriesInfoManager.initializeTargetRegionEntryInfo(
/*DeviceID=*/getMDInt(1), /*FileID=*/getMDInt(2),
/*ParentName=*/getMDString(3), /*Line=*/getMDInt(4),
/*Order=*/getMDInt(5));
break;
}
}
}
void CGOpenMPRuntime::emitKmpRoutineEntryT(QualType KmpInt32Ty) {
if (!KmpRoutineEntryPtrTy) {
// Build typedef kmp_int32 (* kmp_routine_entry_t)(kmp_int32, void *); type.
auto &C = CGM.getContext();
QualType KmpRoutineEntryTyArgs[] = {KmpInt32Ty, C.VoidPtrTy};
FunctionProtoType::ExtProtoInfo EPI;
KmpRoutineEntryPtrQTy = C.getPointerType(
C.getFunctionType(KmpInt32Ty, KmpRoutineEntryTyArgs, EPI));
KmpRoutineEntryPtrTy = CGM.getTypes().ConvertType(KmpRoutineEntryPtrQTy);
}
}
static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
QualType FieldTy) {
auto *Field = FieldDecl::Create(
C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
DC->addDecl(Field);
return Field;
}
QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() {
// Make sure the type of the entry is already created. This is the type we
// have to create:
// struct __tgt_offload_entry{
// void *addr; // Pointer to the offload entry info.
// // (function or global)
// char *name; // Name of the function or global.
// size_t size; // Size of the entry info (0 if it is a function).
// int32_t flags; // Flags associated with the entry, e.g. 'link'.
// int32_t reserved; // Reserved, for use by the runtime library.
// };
if (TgtOffloadEntryQTy.isNull()) {
ASTContext &C = CGM.getContext();
auto *RD = C.buildImplicitRecord("__tgt_offload_entry");
RD->startDefinition();
addFieldToRecordDecl(C, RD, C.VoidPtrTy);
addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
addFieldToRecordDecl(C, RD, C.getSizeType());
addFieldToRecordDecl(
C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
addFieldToRecordDecl(
C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
RD->completeDefinition();
TgtOffloadEntryQTy = C.getRecordType(RD);
}
return TgtOffloadEntryQTy;
}
QualType CGOpenMPRuntime::getTgtDeviceImageQTy() {
// These are the types we need to build:
// struct __tgt_device_image{
// void *ImageStart; // Pointer to the target code start.
// void *ImageEnd; // Pointer to the target code end.
// // We also add the host entries to the device image, as it may be useful
// // for the target runtime to have access to that information.
// __tgt_offload_entry *EntriesBegin; // Begin of the table with all
// // the entries.
// __tgt_offload_entry *EntriesEnd; // End of the table with all the
// // entries (non inclusive).
// };
if (TgtDeviceImageQTy.isNull()) {
ASTContext &C = CGM.getContext();
auto *RD = C.buildImplicitRecord("__tgt_device_image");
RD->startDefinition();
addFieldToRecordDecl(C, RD, C.VoidPtrTy);
addFieldToRecordDecl(C, RD, C.VoidPtrTy);
addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy()));
addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy()));
RD->completeDefinition();
TgtDeviceImageQTy = C.getRecordType(RD);
}
return TgtDeviceImageQTy;
}
QualType CGOpenMPRuntime::getTgtBinaryDescriptorQTy() {
// struct __tgt_bin_desc{
// int32_t NumDevices; // Number of devices supported.
// __tgt_device_image *DeviceImages; // Arrays of device images
// // (one per device).
// __tgt_offload_entry *EntriesBegin; // Begin of the table with all the
// // entries.
// __tgt_offload_entry *EntriesEnd; // End of the table with all the
// // entries (non inclusive).
// };
if (TgtBinaryDescriptorQTy.isNull()) {
ASTContext &C = CGM.getContext();
auto *RD = C.buildImplicitRecord("__tgt_bin_desc");
RD->startDefinition();
addFieldToRecordDecl(
C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true));
addFieldToRecordDecl(C, RD, C.getPointerType(getTgtDeviceImageQTy()));
addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy()));
addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy()));
RD->completeDefinition();
TgtBinaryDescriptorQTy = C.getRecordType(RD);
}
return TgtBinaryDescriptorQTy;
}
namespace {
struct PrivateHelpersTy {
PrivateHelpersTy(const VarDecl *Original, const VarDecl *PrivateCopy,
const VarDecl *PrivateElemInit)
: Original(Original), PrivateCopy(PrivateCopy),
PrivateElemInit(PrivateElemInit) {}
const VarDecl *Original;
const VarDecl *PrivateCopy;
const VarDecl *PrivateElemInit;
};
typedef std::pair<CharUnits /*Align*/, PrivateHelpersTy> PrivateDataTy;
} // anonymous namespace
static RecordDecl *
createPrivatesRecordDecl(CodeGenModule &CGM, ArrayRef<PrivateDataTy> Privates) {
if (!Privates.empty()) {
auto &C = CGM.getContext();
// Build struct .kmp_privates.t. {
// /* private vars */
// };
auto *RD = C.buildImplicitRecord(".kmp_privates.t");
RD->startDefinition();
for (auto &&Pair : Privates) {
auto *VD = Pair.second.Original;
auto Type = VD->getType();
Type = Type.getNonReferenceType();
auto *FD = addFieldToRecordDecl(C, RD, Type);
if (VD->hasAttrs()) {
for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
E(VD->getAttrs().end());
I != E; ++I)
FD->addAttr(*I);
}
}
RD->completeDefinition();
return RD;
}
return nullptr;
}
static RecordDecl *
createKmpTaskTRecordDecl(CodeGenModule &CGM, OpenMPDirectiveKind Kind,
QualType KmpInt32Ty,
QualType KmpRoutineEntryPointerQTy) {
auto &C = CGM.getContext();
// Build struct kmp_task_t {
// void * shareds;
// kmp_routine_entry_t routine;
// kmp_int32 part_id;
// kmp_cmplrdata_t data1;
// kmp_cmplrdata_t data2;
// For taskloops, additional fields:
// kmp_uint64 lb;
// kmp_uint64 ub;
// kmp_int64 st;
// kmp_int32 liter;
// };
auto *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TTK_Union);
UD->startDefinition();
addFieldToRecordDecl(C, UD, KmpInt32Ty);
addFieldToRecordDecl(C, UD, KmpRoutineEntryPointerQTy);
UD->completeDefinition();
QualType KmpCmplrdataTy = C.getRecordType(UD);
auto *RD = C.buildImplicitRecord("kmp_task_t");
RD->startDefinition();
addFieldToRecordDecl(C, RD, C.VoidPtrTy);
addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy);
addFieldToRecordDecl(C, RD, KmpInt32Ty);
addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
if (isOpenMPTaskLoopDirective(Kind)) {
QualType KmpUInt64Ty =
CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/0);
QualType KmpInt64Ty =
CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
addFieldToRecordDecl(C, RD, KmpUInt64Ty);
addFieldToRecordDecl(C, RD, KmpUInt64Ty);
addFieldToRecordDecl(C, RD, KmpInt64Ty);
addFieldToRecordDecl(C, RD, KmpInt32Ty);
}
RD->completeDefinition();
return RD;
}
static RecordDecl *
createKmpTaskTWithPrivatesRecordDecl(CodeGenModule &CGM, QualType KmpTaskTQTy,
ArrayRef<PrivateDataTy> Privates) {
auto &C = CGM.getContext();
// Build struct kmp_task_t_with_privates {
// kmp_task_t task_data;
// .kmp_privates_t. privates;
// };
auto *RD = C.buildImplicitRecord("kmp_task_t_with_privates");
RD->startDefinition();
addFieldToRecordDecl(C, RD, KmpTaskTQTy);
if (auto *PrivateRD = createPrivatesRecordDecl(CGM, Privates)) {
addFieldToRecordDecl(C, RD, C.getRecordType(PrivateRD));
}
RD->completeDefinition();
return RD;
}
/// \brief Emit a proxy function which accepts kmp_task_t as the second
/// argument.
/// \code
/// kmp_int32 .omp_task_entry.(kmp_int32 gtid, kmp_task_t *tt) {
/// TaskFunction(gtid, tt->part_id, &tt->privates, task_privates_map, tt,
/// For taskloops:
/// tt->task_data.lb, tt->task_data.ub, tt->task_data.st, tt->task_data.liter,
/// tt->shareds);
/// return 0;
/// }
/// \endcode
static llvm::Value *
emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc,
OpenMPDirectiveKind Kind, QualType KmpInt32Ty,
QualType KmpTaskTWithPrivatesPtrQTy,
QualType KmpTaskTWithPrivatesQTy, QualType KmpTaskTQTy,
QualType SharedsPtrTy, llvm::Value *TaskFunction,
llvm::Value *TaskPrivatesMap) {
auto &C = CGM.getContext();
FunctionArgList Args;
ImplicitParamDecl GtidArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpInt32Ty);
ImplicitParamDecl TaskTypeArg(C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr,
KmpTaskTWithPrivatesPtrQTy.withRestrict());
Args.push_back(&GtidArg);
Args.push_back(&TaskTypeArg);
auto &TaskEntryFnInfo =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(KmpInt32Ty, Args);
auto *TaskEntryTy = CGM.getTypes().GetFunctionType(TaskEntryFnInfo);
auto *TaskEntry =
llvm::Function::Create(TaskEntryTy, llvm::GlobalValue::InternalLinkage,
".omp_task_entry.", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskEntry, TaskEntryFnInfo);
CodeGenFunction CGF(CGM);
CGF.disableDebugInfo();
CGF.StartFunction(GlobalDecl(), KmpInt32Ty, TaskEntry, TaskEntryFnInfo, Args);
// TaskFunction(gtid, tt->task_data.part_id, &tt->privates, task_privates_map,
// tt,
// For taskloops:
// tt->task_data.lb, tt->task_data.ub, tt->task_data.st, tt->task_data.liter,
// tt->task_data.shareds);
auto *GtidParam = CGF.EmitLoadOfScalar(
CGF.GetAddrOfLocalVar(&GtidArg), /*Volatile=*/false, KmpInt32Ty, Loc);
LValue TDBase = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(&TaskTypeArg),
KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
auto *KmpTaskTWithPrivatesQTyRD =
cast<RecordDecl>(KmpTaskTWithPrivatesQTy->getAsTagDecl());
LValue Base =
CGF.EmitLValueForField(TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin());
auto *KmpTaskTQTyRD = cast<RecordDecl>(KmpTaskTQTy->getAsTagDecl());
auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId);
auto PartIdLVal = CGF.EmitLValueForField(Base, *PartIdFI);
auto *PartidParam = PartIdLVal.getPointer();
auto SharedsFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTShareds);
auto SharedsLVal = CGF.EmitLValueForField(Base, *SharedsFI);
auto *SharedsParam = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfLValue(SharedsLVal, Loc).getScalarVal(),
CGF.ConvertTypeForMem(SharedsPtrTy));
auto PrivatesFI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin(), 1);
llvm::Value *PrivatesParam;
if (PrivatesFI != KmpTaskTWithPrivatesQTyRD->field_end()) {
auto PrivatesLVal = CGF.EmitLValueForField(TDBase, *PrivatesFI);
PrivatesParam = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
PrivatesLVal.getPointer(), CGF.VoidPtrTy);
} else
PrivatesParam = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
llvm::Value *CommonArgs[] = {GtidParam, PartidParam, PrivatesParam,
TaskPrivatesMap,
CGF.Builder
.CreatePointerBitCastOrAddrSpaceCast(
TDBase.getAddress(), CGF.VoidPtrTy)
.getPointer()};
SmallVector<llvm::Value *, 16> CallArgs(std::begin(CommonArgs),
std::end(CommonArgs));
if (isOpenMPTaskLoopDirective(Kind)) {
auto LBFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound);
auto LBLVal = CGF.EmitLValueForField(Base, *LBFI);
auto *LBParam = CGF.EmitLoadOfLValue(LBLVal, Loc).getScalarVal();
auto UBFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound);
auto UBLVal = CGF.EmitLValueForField(Base, *UBFI);
auto *UBParam = CGF.EmitLoadOfLValue(UBLVal, Loc).getScalarVal();
auto StFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTStride);
auto StLVal = CGF.EmitLValueForField(Base, *StFI);
auto *StParam = CGF.EmitLoadOfLValue(StLVal, Loc).getScalarVal();
auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter);
auto LILVal = CGF.EmitLValueForField(Base, *LIFI);
auto *LIParam = CGF.EmitLoadOfLValue(LILVal, Loc).getScalarVal();
CallArgs.push_back(LBParam);
CallArgs.push_back(UBParam);
CallArgs.push_back(StParam);
CallArgs.push_back(LIParam);
}
CallArgs.push_back(SharedsParam);
CGF.EmitCallOrInvoke(TaskFunction, CallArgs);
CGF.EmitStoreThroughLValue(
RValue::get(CGF.Builder.getInt32(/*C=*/0)),
CGF.MakeAddrLValue(CGF.ReturnValue, KmpInt32Ty));
CGF.FinishFunction();
return TaskEntry;
}
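/// Emit the function that runs the destructors of the privatized variables.
/// Illustrative shape of what is generated:
/// \code
///   kmp_int32 .omp_task_destructor.(kmp_int32 gtid,
///                                   kmp_task_t_with_privates *tt) {
///     // destructor calls for the fields of tt->privates that need cleanup
///   }
/// \endcode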
static llvm::Value *emitDestructorsFunction(CodeGenModule &CGM,
SourceLocation Loc,
QualType KmpInt32Ty,
QualType KmpTaskTWithPrivatesPtrQTy,
QualType KmpTaskTWithPrivatesQTy) {
auto &C = CGM.getContext();
FunctionArgList Args;
ImplicitParamDecl GtidArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpInt32Ty);
ImplicitParamDecl TaskTypeArg(C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr,
KmpTaskTWithPrivatesPtrQTy.withRestrict());
Args.push_back(&GtidArg);
Args.push_back(&TaskTypeArg);
FunctionType::ExtInfo Info;
auto &DestructorFnInfo =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(KmpInt32Ty, Args);
auto *DestructorFnTy = CGM.getTypes().GetFunctionType(DestructorFnInfo);
auto *DestructorFn =
llvm::Function::Create(DestructorFnTy, llvm::GlobalValue::InternalLinkage,
".omp_task_destructor.", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, DestructorFn,
DestructorFnInfo);
CodeGenFunction CGF(CGM);
CGF.disableDebugInfo();
CGF.StartFunction(GlobalDecl(), KmpInt32Ty, DestructorFn, DestructorFnInfo,
Args);
LValue Base = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(&TaskTypeArg),
KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
auto *KmpTaskTWithPrivatesQTyRD =
cast<RecordDecl>(KmpTaskTWithPrivatesQTy->getAsTagDecl());
auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
Base = CGF.EmitLValueForField(Base, *FI);
for (auto *Field :
cast<RecordDecl>(FI->getType()->getAsTagDecl())->fields()) {
if (auto DtorKind = Field->getType().isDestructedType()) {
auto FieldLValue = CGF.EmitLValueForField(Base, Field);
CGF.pushDestroy(DtorKind, FieldLValue.getAddress(), Field->getType());
}
}
CGF.FinishFunction();
return DestructorFn;
}
/// \brief Emit a privates mapping function for correct handling of private and
/// firstprivate variables.
/// \code
/// void .omp_task_privates_map.(const .privates. *noalias privs, <ty1>
/// **noalias priv1,..., <tyn> **noalias privn) {
/// *priv1 = &.privates.priv1;
/// ...;
/// *privn = &.privates.privn;
/// }
/// \endcode
static llvm::Value *
emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc,
ArrayRef<const Expr *> PrivateVars,
ArrayRef<const Expr *> FirstprivateVars,
ArrayRef<const Expr *> LastprivateVars,
QualType PrivatesQTy,
ArrayRef<PrivateDataTy> Privates) {
auto &C = CGM.getContext();
FunctionArgList Args;
ImplicitParamDecl TaskPrivatesArg(
C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
C.getPointerType(PrivatesQTy).withConst().withRestrict());
Args.push_back(&TaskPrivatesArg);
llvm::DenseMap<const VarDecl *, unsigned> PrivateVarsPos;
unsigned Counter = 1;
for (auto *E: PrivateVars) {
Args.push_back(ImplicitParamDecl::Create(
C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType()))
.withConst()
.withRestrict()));
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
PrivateVarsPos[VD] = Counter;
++Counter;
}
for (auto *E : FirstprivateVars) {
Args.push_back(ImplicitParamDecl::Create(
C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType()))
.withConst()
.withRestrict()));
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
PrivateVarsPos[VD] = Counter;
++Counter;
}
for (auto *E: LastprivateVars) {
Args.push_back(ImplicitParamDecl::Create(
C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType()))
.withConst()
.withRestrict()));
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
PrivateVarsPos[VD] = Counter;
++Counter;
}
auto &TaskPrivatesMapFnInfo =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *TaskPrivatesMapTy =
CGM.getTypes().GetFunctionType(TaskPrivatesMapFnInfo);
auto *TaskPrivatesMap = llvm::Function::Create(
TaskPrivatesMapTy, llvm::GlobalValue::InternalLinkage,
".omp_task_privates_map.", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskPrivatesMap,
TaskPrivatesMapFnInfo);
TaskPrivatesMap->removeFnAttr(llvm::Attribute::NoInline);
TaskPrivatesMap->addFnAttr(llvm::Attribute::AlwaysInline);
CodeGenFunction CGF(CGM);
CGF.disableDebugInfo();
CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskPrivatesMap,
TaskPrivatesMapFnInfo, Args);
// *privi = &.privates.privi;
LValue Base = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(&TaskPrivatesArg),
TaskPrivatesArg.getType()->castAs<PointerType>());
auto *PrivatesQTyRD = cast<RecordDecl>(PrivatesQTy->getAsTagDecl());
Counter = 0;
for (auto *Field : PrivatesQTyRD->fields()) {
auto FieldLVal = CGF.EmitLValueForField(Base, Field);
auto *VD = Args[PrivateVarsPos[Privates[Counter].second.Original]];
auto RefLVal = CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
auto RefLoadLVal = CGF.EmitLoadOfPointerLValue(
RefLVal.getAddress(), RefLVal.getType()->castAs<PointerType>());
CGF.EmitStoreOfScalar(FieldLVal.getPointer(), RefLoadLVal);
++Counter;
}
CGF.FinishFunction();
return TaskPrivatesMap;
}
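/// Comparator for llvm::array_pod_sort that orders PrivateDataTy entries by
/// decreasing alignment (entries with larger alignment sort first).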
static int array_pod_sort_comparator(const PrivateDataTy *P1,
const PrivateDataTy *P2) {
return P1->first < P2->first ? 1 : (P2->first < P1->first ? -1 : 0);
}
/// Emit initialization for private variables in task-based directives.
static void emitPrivatesInit(CodeGenFunction &CGF,
const OMPExecutableDirective &D,
Address KmpTaskSharedsPtr, LValue TDBase,
const RecordDecl *KmpTaskTWithPrivatesQTyRD,
QualType SharedsTy, QualType SharedsPtrTy,
const OMPTaskDataTy &Data,
ArrayRef<PrivateDataTy> Privates, bool ForDup) {
auto &C = CGF.getContext();
auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
LValue PrivatesBase = CGF.EmitLValueForField(TDBase, *FI);
LValue SrcBase;
if (!Data.FirstprivateVars.empty()) {
SrcBase = CGF.MakeAddrLValue(
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
KmpTaskSharedsPtr, CGF.ConvertTypeForMem(SharedsPtrTy)),
SharedsTy);
}
CodeGenFunction::CGCapturedStmtInfo CapturesInfo(
cast<CapturedStmt>(*D.getAssociatedStmt()));
FI = cast<RecordDecl>(FI->getType()->getAsTagDecl())->field_begin();
for (auto &&Pair : Privates) {
auto *VD = Pair.second.PrivateCopy;
auto *Init = VD->getAnyInitializer();
if (Init && (!ForDup || (isa<CXXConstructExpr>(Init) &&
!CGF.isTrivialInitializer(Init)))) {
LValue PrivateLValue = CGF.EmitLValueForField(PrivatesBase, *FI);
if (auto *Elem = Pair.second.PrivateElemInit) {
auto *OriginalVD = Pair.second.Original;
auto *SharedField = CapturesInfo.lookup(OriginalVD);
auto SharedRefLValue = CGF.EmitLValueForField(SrcBase, SharedField);
SharedRefLValue = CGF.MakeAddrLValue(
Address(SharedRefLValue.getPointer(), C.getDeclAlign(OriginalVD)),
SharedRefLValue.getType(), AlignmentSource::Decl);
QualType Type = OriginalVD->getType();
if (Type->isArrayType()) {
// Initialize firstprivate array.
if (!isa<CXXConstructExpr>(Init) || CGF.isTrivialInitializer(Init)) {
// Perform simple memcpy.
CGF.EmitAggregateAssign(PrivateLValue.getAddress(),
SharedRefLValue.getAddress(), Type);
} else {
// Initialize firstprivate array using element-by-element
// initialization.
CGF.EmitOMPAggregateAssign(
PrivateLValue.getAddress(), SharedRefLValue.getAddress(), Type,
[&CGF, Elem, Init, &CapturesInfo](Address DestElement,
Address SrcElement) {
// Clean up any temporaries needed by the initialization.
CodeGenFunction::OMPPrivateScope InitScope(CGF);
InitScope.addPrivate(
Elem, [SrcElement]() -> Address { return SrcElement; });
(void)InitScope.Privatize();
// Emit initialization for single element.
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(
CGF, &CapturesInfo);
CGF.EmitAnyExprToMem(Init, DestElement,
Init->getType().getQualifiers(),
/*IsInitializer=*/false);
});
}
} else {
CodeGenFunction::OMPPrivateScope InitScope(CGF);
InitScope.addPrivate(Elem, [SharedRefLValue]() -> Address {
return SharedRefLValue.getAddress();
});
(void)InitScope.Privatize();
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CapturesInfo);
CGF.EmitExprAsInit(Init, VD, PrivateLValue,
/*capturedByInit=*/false);
}
} else
CGF.EmitExprAsInit(Init, VD, PrivateLValue, /*capturedByInit=*/false);
}
++FI;
}
}
/// Check whether any private copy has a non-trivial (constructor)
/// initializer; used to decide if a task duplication function is required
/// for taskloops.
static bool checkInitIsRequired(CodeGenFunction &CGF,
ArrayRef<PrivateDataTy> Privates) {
bool InitRequired = false;
for (auto &&Pair : Privates) {
auto *VD = Pair.second.PrivateCopy;
auto *Init = VD->getAnyInitializer();
InitRequired = InitRequired || (Init && isa<CXXConstructExpr>(Init) &&
!CGF.isTrivialInitializer(Init));
}
return InitRequired;
}
/// Emit task_dup function (for initialization of
/// private/firstprivate/lastprivate vars and last_iter flag)
/// \code
/// void __task_dup_entry(kmp_task_t *task_dst, const kmp_task_t *task_src, int
/// lastpriv) {
/// // setup lastprivate flag
/// task_dst->last = lastpriv;
/// // could be constructor calls here...
/// }
/// \endcode
static llvm::Value *
emitTaskDupFunction(CodeGenModule &CGM, SourceLocation Loc,
const OMPExecutableDirective &D,
QualType KmpTaskTWithPrivatesPtrQTy,
const RecordDecl *KmpTaskTWithPrivatesQTyRD,
const RecordDecl *KmpTaskTQTyRD, QualType SharedsTy,
QualType SharedsPtrTy, const OMPTaskDataTy &Data,
ArrayRef<PrivateDataTy> Privates, bool WithLastIter) {
auto &C = CGM.getContext();
FunctionArgList Args;
ImplicitParamDecl DstArg(C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy);
ImplicitParamDecl SrcArg(C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy);
ImplicitParamDecl LastprivArg(C, /*DC=*/nullptr, Loc,
/*Id=*/nullptr, C.IntTy);
Args.push_back(&DstArg);
Args.push_back(&SrcArg);
Args.push_back(&LastprivArg);
auto &TaskDupFnInfo =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *TaskDupTy = CGM.getTypes().GetFunctionType(TaskDupFnInfo);
auto *TaskDup =
llvm::Function::Create(TaskDupTy, llvm::GlobalValue::InternalLinkage,
".omp_task_dup.", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskDup, TaskDupFnInfo);
CodeGenFunction CGF(CGM);
CGF.disableDebugInfo();
CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskDup, TaskDupFnInfo, Args);
LValue TDBase = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(&DstArg),
KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
// task_dst->liter = lastpriv;
if (WithLastIter) {
auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter);
LValue Base = CGF.EmitLValueForField(
TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin());
LValue LILVal = CGF.EmitLValueForField(Base, *LIFI);
llvm::Value *Lastpriv = CGF.EmitLoadOfScalar(
CGF.GetAddrOfLocalVar(&LastprivArg), /*Volatile=*/false, C.IntTy, Loc);
CGF.EmitStoreOfScalar(Lastpriv, LILVal);
}
// Emit initial values for private copies (if any).
assert(!Privates.empty());
Address KmpTaskSharedsPtr = Address::invalid();
if (!Data.FirstprivateVars.empty()) {
LValue TDBase = CGF.EmitLoadOfPointerLValue(
CGF.GetAddrOfLocalVar(&SrcArg),
KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
LValue Base = CGF.EmitLValueForField(
TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin());
KmpTaskSharedsPtr = Address(
CGF.EmitLoadOfScalar(CGF.EmitLValueForField(
Base, *std::next(KmpTaskTQTyRD->field_begin(),
KmpTaskTShareds)),
Loc),
CGF.getNaturalTypeAlignment(SharedsTy));
}
emitPrivatesInit(CGF, D, KmpTaskSharedsPtr, TDBase, KmpTaskTWithPrivatesQTyRD,
SharedsTy, SharedsPtrTy, Data, Privates, /*ForDup=*/true);
CGF.FinishFunction();
return TaskDup;
}
/// Checks if destructor function is required to be generated.
/// \return true if cleanups are required, false otherwise.
static bool
checkDestructorsRequired(const RecordDecl *KmpTaskTWithPrivatesQTyRD) {
bool NeedsCleanup = false;
auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
auto *PrivateRD = cast<RecordDecl>(FI->getType()->getAsTagDecl());
for (auto *FD : PrivateRD->fields()) {
NeedsCleanup = NeedsCleanup || FD->getType().isDestructedType();
if (NeedsCleanup)
break;
}
return NeedsCleanup;
}
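/// Illustrative example: for
/// \code
///   #pragma omp task firstprivate(a)
///   { use(a); }
/// \endcode
/// emitTaskInit builds the kmp_task_t_with_privates record, the
/// .omp_task_entry. proxy and the __kmpc_omp_task_alloc call, and seeds the
/// private copy of 'a' from the captured shareds.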
CGOpenMPRuntime::TaskResultTy
CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
const OMPExecutableDirective &D,
llvm::Value *TaskFunction, QualType SharedsTy,
Address Shareds, const OMPTaskDataTy &Data) {
auto &C = CGM.getContext();
llvm::SmallVector<PrivateDataTy, 4> Privates;
// Aggregate privates and sort them by decreasing alignment.
auto I = Data.PrivateCopies.begin();
for (auto *E : Data.PrivateVars) {
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
Privates.push_back(std::make_pair(
C.getDeclAlign(VD),
PrivateHelpersTy(VD, cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()),
/*PrivateElemInit=*/nullptr)));
++I;
}
I = Data.FirstprivateCopies.begin();
auto IElemInitRef = Data.FirstprivateInits.begin();
for (auto *E : Data.FirstprivateVars) {
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
Privates.push_back(std::make_pair(
C.getDeclAlign(VD),
PrivateHelpersTy(
VD, cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()),
cast<VarDecl>(cast<DeclRefExpr>(*IElemInitRef)->getDecl()))));
++I;
++IElemInitRef;
}
I = Data.LastprivateCopies.begin();
for (auto *E : Data.LastprivateVars) {
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
Privates.push_back(std::make_pair(
C.getDeclAlign(VD),
PrivateHelpersTy(VD, cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()),
/*PrivateElemInit=*/nullptr)));
++I;
}
llvm::array_pod_sort(Privates.begin(), Privates.end(),
array_pod_sort_comparator);
auto KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1);
// Build type kmp_routine_entry_t (if not built yet).
emitKmpRoutineEntryT(KmpInt32Ty);
// Build type kmp_task_t (if not built yet).
if (KmpTaskTQTy.isNull()) {
KmpTaskTQTy = C.getRecordType(createKmpTaskTRecordDecl(
CGM, D.getDirectiveKind(), KmpInt32Ty, KmpRoutineEntryPtrQTy));
}
auto *KmpTaskTQTyRD = cast<RecordDecl>(KmpTaskTQTy->getAsTagDecl());
// Build the particular kmp_task_t_with_privates struct for the given task.
auto *KmpTaskTWithPrivatesQTyRD =
createKmpTaskTWithPrivatesRecordDecl(CGM, KmpTaskTQTy, Privates);
auto KmpTaskTWithPrivatesQTy = C.getRecordType(KmpTaskTWithPrivatesQTyRD);
QualType KmpTaskTWithPrivatesPtrQTy =
C.getPointerType(KmpTaskTWithPrivatesQTy);
auto *KmpTaskTWithPrivatesTy = CGF.ConvertType(KmpTaskTWithPrivatesQTy);
auto *KmpTaskTWithPrivatesPtrTy = KmpTaskTWithPrivatesTy->getPointerTo();
auto *KmpTaskTWithPrivatesTySize = CGF.getTypeSize(KmpTaskTWithPrivatesQTy);
QualType SharedsPtrTy = C.getPointerType(SharedsTy);
// Emit initial values for private copies (if any).
llvm::Value *TaskPrivatesMap = nullptr;
auto *TaskPrivatesMapTy =
std::next(cast<llvm::Function>(TaskFunction)->getArgumentList().begin(),
3)
->getType();
if (!Privates.empty()) {
auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
TaskPrivatesMap = emitTaskPrivateMappingFunction(
CGM, Loc, Data.PrivateVars, Data.FirstprivateVars, Data.LastprivateVars,
FI->getType(), Privates);
TaskPrivatesMap = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
TaskPrivatesMap, TaskPrivatesMapTy);
} else {
TaskPrivatesMap = llvm::ConstantPointerNull::get(
cast<llvm::PointerType>(TaskPrivatesMapTy));
}
// Build a proxy function kmp_int32 .omp_task_entry.(kmp_int32 gtid,
// kmp_task_t *tt);
auto *TaskEntry = emitProxyTaskFunction(
CGM, Loc, D.getDirectiveKind(), KmpInt32Ty, KmpTaskTWithPrivatesPtrQTy,
KmpTaskTWithPrivatesQTy, KmpTaskTQTy, SharedsPtrTy, TaskFunction,
TaskPrivatesMap);
// Build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
// kmp_routine_entry_t *task_entry);
// Task flags. Format is taken from
// http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h,
// description of kmp_tasking_flags struct.
enum {
TiedFlag = 0x1,
FinalFlag = 0x2,
DestructorsFlag = 0x8,
PriorityFlag = 0x20
};
unsigned Flags = Data.Tied ? TiedFlag : 0;
bool NeedsCleanup = false;
if (!Privates.empty()) {
NeedsCleanup = checkDestructorsRequired(KmpTaskTWithPrivatesQTyRD);
if (NeedsCleanup)
Flags = Flags | DestructorsFlag;
}
if (Data.Priority.getInt())
Flags = Flags | PriorityFlag;
auto *TaskFlags =
Data.Final.getPointer()
? CGF.Builder.CreateSelect(Data.Final.getPointer(),
CGF.Builder.getInt32(FinalFlag),
CGF.Builder.getInt32(/*C=*/0))
: CGF.Builder.getInt32(Data.Final.getInt() ? FinalFlag : 0);
TaskFlags = CGF.Builder.CreateOr(TaskFlags, CGF.Builder.getInt32(Flags));
auto *SharedsSize = CGM.getSize(C.getTypeSizeInChars(SharedsTy));
llvm::Value *AllocArgs[] = {emitUpdateLocation(CGF, Loc),
getThreadID(CGF, Loc), TaskFlags,
KmpTaskTWithPrivatesTySize, SharedsSize,
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
TaskEntry, KmpRoutineEntryPtrTy)};
auto *NewTask = CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_omp_task_alloc), AllocArgs);
auto *NewTaskNewTaskTTy = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
NewTask, KmpTaskTWithPrivatesPtrTy);
LValue Base = CGF.MakeNaturalAlignAddrLValue(NewTaskNewTaskTTy,
KmpTaskTWithPrivatesQTy);
LValue TDBase =
CGF.EmitLValueForField(Base, *KmpTaskTWithPrivatesQTyRD->field_begin());
// Fill the data in the resulting kmp_task_t record.
// Copy shareds if there are any.
Address KmpTaskSharedsPtr = Address::invalid();
if (!SharedsTy->getAsStructureType()->getDecl()->field_empty()) {
KmpTaskSharedsPtr =
Address(CGF.EmitLoadOfScalar(
CGF.EmitLValueForField(
TDBase, *std::next(KmpTaskTQTyRD->field_begin(),
KmpTaskTShareds)),
Loc),
CGF.getNaturalTypeAlignment(SharedsTy));
CGF.EmitAggregateCopy(KmpTaskSharedsPtr, Shareds, SharedsTy);
}
// Emit initial values for private copies (if any).
TaskResultTy Result;
if (!Privates.empty()) {
emitPrivatesInit(CGF, D, KmpTaskSharedsPtr, Base, KmpTaskTWithPrivatesQTyRD,
SharedsTy, SharedsPtrTy, Data, Privates,
/*ForDup=*/false);
if (isOpenMPTaskLoopDirective(D.getDirectiveKind()) &&
(!Data.LastprivateVars.empty() || checkInitIsRequired(CGF, Privates))) {
Result.TaskDupFn = emitTaskDupFunction(
CGM, Loc, D, KmpTaskTWithPrivatesPtrQTy, KmpTaskTWithPrivatesQTyRD,
KmpTaskTQTyRD, SharedsTy, SharedsPtrTy, Data, Privates,
/*WithLastIter=*/!Data.LastprivateVars.empty());
}
}
// Fields of union "kmp_cmplrdata_t" for destructors and priority.
enum { Priority = 0, Destructors = 1 };
// Provide pointer to function with destructors for privates.
auto FI = std::next(KmpTaskTQTyRD->field_begin(), Data1);
auto *KmpCmplrdataUD = (*FI)->getType()->getAsUnionType()->getDecl();
if (NeedsCleanup) {
llvm::Value *DestructorFn = emitDestructorsFunction(
CGM, Loc, KmpInt32Ty, KmpTaskTWithPrivatesPtrQTy,
KmpTaskTWithPrivatesQTy);
LValue Data1LV = CGF.EmitLValueForField(TDBase, *FI);
LValue DestructorsLV = CGF.EmitLValueForField(
Data1LV, *std::next(KmpCmplrdataUD->field_begin(), Destructors));
CGF.EmitStoreOfScalar(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
DestructorFn, KmpRoutineEntryPtrTy),
DestructorsLV);
}
// Set priority.
if (Data.Priority.getInt()) {
LValue Data2LV = CGF.EmitLValueForField(
TDBase, *std::next(KmpTaskTQTyRD->field_begin(), Data2));
LValue PriorityLV = CGF.EmitLValueForField(
Data2LV, *std::next(KmpCmplrdataUD->field_begin(), Priority));
CGF.EmitStoreOfScalar(Data.Priority.getPointer(), PriorityLV);
}
Result.NewTask = NewTask;
Result.TaskEntry = TaskEntry;
Result.NewTaskNewTaskTTy = NewTaskNewTaskTTy;
Result.TDBase = TDBase;
Result.KmpTaskTQTyRD = KmpTaskTQTyRD;
return Result;
}
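// A rough sketch (not from the original source) of the allocation call built
// above, assuming a plain tied task with no 'final'/'priority' clause and no
// privates needing destructors, so the static part of the flags is just
// TiedFlag == 0x1:
//   NewTask = __kmpc_omp_task_alloc(<loc>, <gtid>, /*flags=*/0x1,
//                                   KmpTaskTWithPrivatesTySize, SharedsSize,
//                                   bitcast(TaskEntry));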
void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
const OMPExecutableDirective &D,
llvm::Value *TaskFunction,
QualType SharedsTy, Address Shareds,
const Expr *IfCond,
const OMPTaskDataTy &Data) {
if (!CGF.HaveInsertPoint())
return;
TaskResultTy Result =
emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data);
llvm::Value *NewTask = Result.NewTask;
llvm::Value *TaskEntry = Result.TaskEntry;
llvm::Value *NewTaskNewTaskTTy = Result.NewTaskNewTaskTTy;
LValue TDBase = Result.TDBase;
RecordDecl *KmpTaskTQTyRD = Result.KmpTaskTQTyRD;
auto &C = CGM.getContext();
// Process list of dependences.
Address DependenciesArray = Address::invalid();
unsigned NumDependencies = Data.Dependences.size();
if (NumDependencies) {
// Dependence kind for RTL.
enum RTLDependenceKindTy { DepIn = 0x01, DepInOut = 0x3 };
enum RTLDependInfoFieldsTy { BaseAddr, Len, Flags };
RecordDecl *KmpDependInfoRD;
QualType FlagsTy =
C.getIntTypeForBitwidth(C.getTypeSize(C.BoolTy), /*Signed=*/false);
llvm::Type *LLVMFlagsTy = CGF.ConvertTypeForMem(FlagsTy);
if (KmpDependInfoTy.isNull()) {
KmpDependInfoRD = C.buildImplicitRecord("kmp_depend_info");
KmpDependInfoRD->startDefinition();
addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType());
addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType());
addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy);
KmpDependInfoRD->completeDefinition();
KmpDependInfoTy = C.getRecordType(KmpDependInfoRD);
} else
KmpDependInfoRD = cast<RecordDecl>(KmpDependInfoTy->getAsTagDecl());
CharUnits DependencySize = C.getTypeSizeInChars(KmpDependInfoTy);
// Define type kmp_depend_info[<Dependences.size()>];
QualType KmpDependInfoArrayTy = C.getConstantArrayType(
KmpDependInfoTy, llvm::APInt(/*numBits=*/64, NumDependencies),
ArrayType::Normal, /*IndexTypeQuals=*/0);
// kmp_depend_info[<Dependences.size()>] deps;
DependenciesArray =
CGF.CreateMemTemp(KmpDependInfoArrayTy, ".dep.arr.addr");
for (unsigned i = 0; i < NumDependencies; ++i) {
const Expr *E = Data.Dependences[i].second;
auto Addr = CGF.EmitLValue(E);
llvm::Value *Size;
QualType Ty = E->getType();
if (auto *ASE = dyn_cast<OMPArraySectionExpr>(E->IgnoreParenImpCasts())) {
LValue UpAddrLVal =
CGF.EmitOMPArraySectionExpr(ASE, /*LowerBound=*/false);
llvm::Value *UpAddr =
CGF.Builder.CreateConstGEP1_32(UpAddrLVal.getPointer(), /*Idx0=*/1);
llvm::Value *LowIntPtr =
CGF.Builder.CreatePtrToInt(Addr.getPointer(), CGM.SizeTy);
llvm::Value *UpIntPtr = CGF.Builder.CreatePtrToInt(UpAddr, CGM.SizeTy);
Size = CGF.Builder.CreateNUWSub(UpIntPtr, LowIntPtr);
} else
Size = CGF.getTypeSize(Ty);
auto Base = CGF.MakeAddrLValue(
CGF.Builder.CreateConstArrayGEP(DependenciesArray, i, DependencySize),
KmpDependInfoTy);
// deps[i].base_addr = &<Dependences[i].second>;
auto BaseAddrLVal = CGF.EmitLValueForField(
Base, *std::next(KmpDependInfoRD->field_begin(), BaseAddr));
CGF.EmitStoreOfScalar(
CGF.Builder.CreatePtrToInt(Addr.getPointer(), CGF.IntPtrTy),
BaseAddrLVal);
// deps[i].len = sizeof(<Dependences[i].second>);
auto LenLVal = CGF.EmitLValueForField(
Base, *std::next(KmpDependInfoRD->field_begin(), Len));
CGF.EmitStoreOfScalar(Size, LenLVal);
// deps[i].flags = <Dependences[i].first>;
RTLDependenceKindTy DepKind;
switch (Data.Dependences[i].first) {
case OMPC_DEPEND_in:
DepKind = DepIn;
break;
// Out and InOut dependencies must use the same code.
case OMPC_DEPEND_out:
case OMPC_DEPEND_inout:
DepKind = DepInOut;
break;
case OMPC_DEPEND_source:
case OMPC_DEPEND_sink:
case OMPC_DEPEND_unknown:
llvm_unreachable("Unknown task dependence type");
}
auto FlagsLVal = CGF.EmitLValueForField(
Base, *std::next(KmpDependInfoRD->field_begin(), Flags));
CGF.EmitStoreOfScalar(llvm::ConstantInt::get(LLVMFlagsTy, DepKind),
FlagsLVal);
}
DependenciesArray = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.Builder.CreateStructGEP(DependenciesArray, 0, CharUnits::Zero()),
CGF.VoidPtrTy);
}
// NOTE: routine and part_id fields are initialized by __kmpc_omp_task_alloc()
// libcall.
// Build kmp_int32 __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid,
// kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
// kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) if dependence
// list is not empty
auto *ThreadID = getThreadID(CGF, Loc);
auto *UpLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *TaskArgs[] = { UpLoc, ThreadID, NewTask };
llvm::Value *DepTaskArgs[7];
if (NumDependencies) {
DepTaskArgs[0] = UpLoc;
DepTaskArgs[1] = ThreadID;
DepTaskArgs[2] = NewTask;
DepTaskArgs[3] = CGF.Builder.getInt32(NumDependencies);
DepTaskArgs[4] = DependenciesArray.getPointer();
DepTaskArgs[5] = CGF.Builder.getInt32(0);
DepTaskArgs[6] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
}
auto &&ThenCodeGen = [this, Loc, &Data, TDBase, KmpTaskTQTyRD,
NumDependencies, &TaskArgs,
&DepTaskArgs](CodeGenFunction &CGF, PrePostActionTy &) {
if (!Data.Tied) {
auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId);
auto PartIdLVal = CGF.EmitLValueForField(TDBase, *PartIdFI);
CGF.EmitStoreOfScalar(CGF.Builder.getInt32(0), PartIdLVal);
}
if (NumDependencies) {
CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_omp_task_with_deps), DepTaskArgs);
} else {
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task),
TaskArgs);
}
// Check if parent region is untied and build return for untied task.
if (auto *Region =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
Region->emitUntiedSwitch(CGF);
};
llvm::Value *DepWaitTaskArgs[6];
if (NumDependencies) {
DepWaitTaskArgs[0] = UpLoc;
DepWaitTaskArgs[1] = ThreadID;
DepWaitTaskArgs[2] = CGF.Builder.getInt32(NumDependencies);
DepWaitTaskArgs[3] = DependenciesArray.getPointer();
DepWaitTaskArgs[4] = CGF.Builder.getInt32(0);
DepWaitTaskArgs[5] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
}
auto &&ElseCodeGen = [&TaskArgs, ThreadID, NewTaskNewTaskTTy, TaskEntry,
NumDependencies, &DepWaitTaskArgs](CodeGenFunction &CGF,
PrePostActionTy &) {
auto &RT = CGF.CGM.getOpenMPRuntime();
CodeGenFunction::RunCleanupsScope LocalScope(CGF);
// Build void __kmpc_omp_wait_deps(ident_t *, kmp_int32 gtid,
// kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
// ndeps_noalias, kmp_depend_info_t *noalias_dep_list); if dependence info
// is specified.
if (NumDependencies)
CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__kmpc_omp_wait_deps),
DepWaitTaskArgs);
// Call proxy_task_entry(gtid, new_task);
auto &&CodeGen = [TaskEntry, ThreadID, NewTaskNewTaskTTy](
CodeGenFunction &CGF, PrePostActionTy &Action) {
Action.Enter(CGF);
llvm::Value *OutlinedFnArgs[] = {ThreadID, NewTaskNewTaskTTy};
CGF.EmitCallOrInvoke(TaskEntry, OutlinedFnArgs);
};
// Build void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid,
// kmp_task_t *new_task);
// Build void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
// kmp_task_t *new_task);
RegionCodeGenTy RCG(CodeGen);
CommonActionTy Action(
RT.createRuntimeFunction(OMPRTL__kmpc_omp_task_begin_if0), TaskArgs,
RT.createRuntimeFunction(OMPRTL__kmpc_omp_task_complete_if0), TaskArgs);
RCG.setAction(Action);
RCG(CGF);
};
if (IfCond)
emitOMPIfClause(CGF, IfCond, ThenCodeGen, ElseCodeGen);
else {
RegionCodeGenTy ThenRCG(ThenCodeGen);
ThenRCG(CGF);
}
}
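// A rough sketch of the control flow emitted above, assuming a hypothetical
// `#pragma omp task if(cond)` with no 'depend' clauses:
//   if (cond) {
//     __kmpc_omp_task(<loc>, <gtid>, new_task);
//   } else {
//     __kmpc_omp_task_begin_if0(<loc>, <gtid>, new_task);
//     .omp_task_entry.(<gtid>, new_task); // proxy entry invoked directly
//     __kmpc_omp_task_complete_if0(<loc>, <gtid>, new_task);
//   }
// With 'depend' clauses the then-branch calls __kmpc_omp_task_with_deps and
// the else-branch first calls __kmpc_omp_wait_deps on the same dependence
// array.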
void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
const OMPLoopDirective &D,
llvm::Value *TaskFunction,
QualType SharedsTy, Address Shareds,
const Expr *IfCond,
const OMPTaskDataTy &Data) {
if (!CGF.HaveInsertPoint())
return;
TaskResultTy Result =
emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data);
// NOTE: routine and part_id fields are initialized by __kmpc_omp_task_alloc()
// libcall.
// Call to void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
// if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
// sched, kmp_uint64 grainsize, void *task_dup);
llvm::Value *ThreadID = getThreadID(CGF, Loc);
llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *IfVal;
if (IfCond) {
IfVal = CGF.Builder.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.IntTy,
/*isSigned=*/true);
} else
IfVal = llvm::ConstantInt::getSigned(CGF.IntTy, /*V=*/1);
LValue LBLVal = CGF.EmitLValueForField(
Result.TDBase,
*std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound));
auto *LBVar =
cast<VarDecl>(cast<DeclRefExpr>(D.getLowerBoundVariable())->getDecl());
CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(), LBLVal.getQuals(),
/*IsInitializer=*/true);
LValue UBLVal = CGF.EmitLValueForField(
Result.TDBase,
*std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound));
auto *UBVar =
cast<VarDecl>(cast<DeclRefExpr>(D.getUpperBoundVariable())->getDecl());
CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(), UBLVal.getQuals(),
/*IsInitializer=*/true);
LValue StLVal = CGF.EmitLValueForField(
Result.TDBase,
*std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTStride));
auto *StVar =
cast<VarDecl>(cast<DeclRefExpr>(D.getStrideVariable())->getDecl());
CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(), StLVal.getQuals(),
/*IsInitializer=*/true);
enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 };
llvm::Value *TaskArgs[] = {
UpLoc, ThreadID, Result.NewTask, IfVal, LBLVal.getPointer(),
UBLVal.getPointer(), CGF.EmitLoadOfScalar(StLVal, SourceLocation()),
llvm::ConstantInt::getSigned(CGF.IntTy, Data.Nogroup ? 1 : 0),
llvm::ConstantInt::getSigned(
CGF.IntTy, Data.Schedule.getPointer()
? Data.Schedule.getInt() ? NumTasks : Grainsize
: NoSchedule),
Data.Schedule.getPointer()
? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty,
/*isSigned=*/false)
: llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0),
Result.TaskDupFn
? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Result.TaskDupFn,
CGF.VoidPtrTy)
: llvm::ConstantPointerNull::get(CGF.VoidPtrTy)};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_taskloop), TaskArgs);
}
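// A rough sketch of the call emitted above for a hypothetical
// `#pragma omp taskloop grainsize(4)` with no 'if'/'nogroup' clause and no
// task_dup function required:
//   __kmpc_taskloop(<loc>, <gtid>, new_task, /*if_val=*/1, &task->lb,
//                   &task->ub, task->st, /*nogroup=*/0, /*sched=*/Grainsize,
//                   /*grainsize=*/4, /*task_dup=*/NULL);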
/// \brief Emit reduction operation for each element of an array (required for
/// array sections): LHS op = RHS.
/// \param Type Type of array.
/// \param LHSVar Variable on the left side of the reduction operation
/// (references element of array in original variable).
/// \param RHSVar Variable on the right side of the reduction operation
/// (references element of array in original variable).
/// \param RedOpGen Generator of reduction operation with use of LHSVar and
/// RHSVar.
static void EmitOMPAggregateReduction(
CodeGenFunction &CGF, QualType Type, const VarDecl *LHSVar,
const VarDecl *RHSVar,
const llvm::function_ref<void(CodeGenFunction &CGF, const Expr *,
const Expr *, const Expr *)> &RedOpGen,
const Expr *XExpr = nullptr, const Expr *EExpr = nullptr,
const Expr *UpExpr = nullptr) {
// Perform element-by-element reduction.
QualType ElementTy;
Address LHSAddr = CGF.GetAddrOfLocalVar(LHSVar);
Address RHSAddr = CGF.GetAddrOfLocalVar(RHSVar);
// Drill down to the base element type on both arrays.
auto ArrayTy = Type->getAsArrayTypeUnsafe();
auto NumElements = CGF.emitArrayLength(ArrayTy, ElementTy, LHSAddr);
auto RHSBegin = RHSAddr.getPointer();
auto LHSBegin = LHSAddr.getPointer();
// Cast from pointer to array type to pointer to single element.
auto LHSEnd = CGF.Builder.CreateGEP(LHSBegin, NumElements);
// The basic structure here is a while-do loop.
auto BodyBB = CGF.createBasicBlock("omp.arraycpy.body");
auto DoneBB = CGF.createBasicBlock("omp.arraycpy.done");
auto IsEmpty =
CGF.Builder.CreateICmpEQ(LHSBegin, LHSEnd, "omp.arraycpy.isempty");
CGF.Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
// Enter the loop body, making that address the current address.
auto EntryBB = CGF.Builder.GetInsertBlock();
CGF.EmitBlock(BodyBB);
CharUnits ElementSize = CGF.getContext().getTypeSizeInChars(ElementTy);
llvm::PHINode *RHSElementPHI = CGF.Builder.CreatePHI(
RHSBegin->getType(), 2, "omp.arraycpy.srcElementPast");
RHSElementPHI->addIncoming(RHSBegin, EntryBB);
Address RHSElementCurrent =
Address(RHSElementPHI,
RHSAddr.getAlignment().alignmentOfArrayElement(ElementSize));
llvm::PHINode *LHSElementPHI = CGF.Builder.CreatePHI(
LHSBegin->getType(), 2, "omp.arraycpy.destElementPast");
LHSElementPHI->addIncoming(LHSBegin, EntryBB);
Address LHSElementCurrent =
Address(LHSElementPHI,
LHSAddr.getAlignment().alignmentOfArrayElement(ElementSize));
// Emit copy.
CodeGenFunction::OMPPrivateScope Scope(CGF);
Scope.addPrivate(LHSVar, [=]() -> Address { return LHSElementCurrent; });
Scope.addPrivate(RHSVar, [=]() -> Address { return RHSElementCurrent; });
Scope.Privatize();
RedOpGen(CGF, XExpr, EExpr, UpExpr);
Scope.ForceCleanup();
// Shift the address forward by one element.
auto LHSElementNext = CGF.Builder.CreateConstGEP1_32(
LHSElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element");
auto RHSElementNext = CGF.Builder.CreateConstGEP1_32(
RHSElementPHI, /*Idx0=*/1, "omp.arraycpy.src.element");
// Check whether we've reached the end.
auto Done =
CGF.Builder.CreateICmpEQ(LHSElementNext, LHSEnd, "omp.arraycpy.done");
CGF.Builder.CreateCondBr(Done, DoneBB, BodyBB);
LHSElementPHI->addIncoming(LHSElementNext, CGF.Builder.GetInsertBlock());
RHSElementPHI->addIncoming(RHSElementNext, CGF.Builder.GetInsertBlock());
// Done.
CGF.EmitBlock(DoneBB, /*IsFinished=*/true);
}
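// A rough C-level sketch of the loop emitted above (ElemTy is a placeholder
// for the drilled-down element type):
//   ElemTy *lhs = LHSBegin, *rhs = RHSBegin, *end = LHSBegin + NumElements;
//   if (lhs != end)
//     do { <RedOpGen on *lhs, *rhs>; ++lhs; ++rhs; } while (lhs != end);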
/// Emit reduction combiner. If the combiner is a simple expression, emit it as
/// is; otherwise treat it as the combiner of a UDR decl and emit it as a call
/// to the UDR combiner function.
static void emitReductionCombiner(CodeGenFunction &CGF,
const Expr *ReductionOp) {
if (auto *CE = dyn_cast<CallExpr>(ReductionOp))
if (auto *OVE = dyn_cast<OpaqueValueExpr>(CE->getCallee()))
if (auto *DRE =
dyn_cast<DeclRefExpr>(OVE->getSourceExpr()->IgnoreImpCasts()))
if (auto *DRD = dyn_cast<OMPDeclareReductionDecl>(DRE->getDecl())) {
std::pair<llvm::Function *, llvm::Function *> Reduction =
CGF.CGM.getOpenMPRuntime().getUserDefinedReduction(DRD);
RValue Func = RValue::get(Reduction.first);
CodeGenFunction::OpaqueValueMapping Map(CGF, OVE, Func);
CGF.EmitIgnoredExpr(ReductionOp);
return;
}
CGF.EmitIgnoredExpr(ReductionOp);
}
static llvm::Value *emitReductionFunction(CodeGenModule &CGM,
llvm::Type *ArgsType,
ArrayRef<const Expr *> Privates,
ArrayRef<const Expr *> LHSExprs,
ArrayRef<const Expr *> RHSExprs,
ArrayRef<const Expr *> ReductionOps) {
auto &C = CGM.getContext();
// void reduction_func(void *LHSArg, void *RHSArg);
FunctionArgList Args;
ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
C.VoidPtrTy);
ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
C.VoidPtrTy);
Args.push_back(&LHSArg);
Args.push_back(&RHSArg);
auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
".omp.reduction.reduction_func", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
// Dst = (void*[n])(LHSArg);
// Src = (void*[n])(RHSArg);
Address LHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)),
ArgsType), CGF.getPointerAlign());
Address RHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)),
ArgsType), CGF.getPointerAlign());
// ...
// *(Type<i>*)lhs[i] = RedOp<i>(*(Type<i>*)lhs[i], *(Type<i>*)rhs[i]);
// ...
CodeGenFunction::OMPPrivateScope Scope(CGF);
auto IPriv = Privates.begin();
unsigned Idx = 0;
for (unsigned I = 0, E = ReductionOps.size(); I < E; ++I, ++IPriv, ++Idx) {
auto RHSVar = cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl());
Scope.addPrivate(RHSVar, [&]() -> Address {
return emitAddrOfVarFromArray(CGF, RHS, Idx, RHSVar);
});
auto LHSVar = cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl());
Scope.addPrivate(LHSVar, [&]() -> Address {
return emitAddrOfVarFromArray(CGF, LHS, Idx, LHSVar);
});
QualType PrivTy = (*IPriv)->getType();
if (PrivTy->isVariablyModifiedType()) {
// Get array size and emit VLA type.
++Idx;
Address Elem =
CGF.Builder.CreateConstArrayGEP(LHS, Idx, CGF.getPointerSize());
llvm::Value *Ptr = CGF.Builder.CreateLoad(Elem);
auto *VLA = CGF.getContext().getAsVariableArrayType(PrivTy);
auto *OVE = cast<OpaqueValueExpr>(VLA->getSizeExpr());
CodeGenFunction::OpaqueValueMapping OpaqueMap(
CGF, OVE, RValue::get(CGF.Builder.CreatePtrToInt(Ptr, CGF.SizeTy)));
CGF.EmitVariablyModifiedType(PrivTy);
}
}
Scope.Privatize();
IPriv = Privates.begin();
auto ILHS = LHSExprs.begin();
auto IRHS = RHSExprs.begin();
for (auto *E : ReductionOps) {
if ((*IPriv)->getType()->isArrayType()) {
// Emit reduction for array section.
auto *LHSVar = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
EmitOMPAggregateReduction(
CGF, (*IPriv)->getType(), LHSVar, RHSVar,
[=](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) {
emitReductionCombiner(CGF, E);
});
} else
// Emit reduction for array subscript or single variable.
emitReductionCombiner(CGF, E);
++IPriv;
++ILHS;
++IRHS;
}
Scope.ForceCleanup();
CGF.FinishFunction();
return Fn;
}
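// A rough sketch of the void* array layout this function expects, assuming a
// hypothetical scalar reduction variable 'x' followed by a VLA 'a' (the slot
// after a VLA pointer carries its size, stored by emitReduction below and
// read back here via CreatePtrToInt to re-emit the VLA type):
//   RedList = { &x_priv, &a_priv[0], (void *)a_size, ... };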
static void emitSingleReductionCombiner(CodeGenFunction &CGF,
const Expr *ReductionOp,
const Expr *PrivateRef,
const DeclRefExpr *LHS,
const DeclRefExpr *RHS) {
if (PrivateRef->getType()->isArrayType()) {
// Emit reduction for array section.
auto *LHSVar = cast<VarDecl>(LHS->getDecl());
auto *RHSVar = cast<VarDecl>(RHS->getDecl());
EmitOMPAggregateReduction(
CGF, PrivateRef->getType(), LHSVar, RHSVar,
[=](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) {
emitReductionCombiner(CGF, ReductionOp);
});
} else
// Emit reduction for array subscript or single variable.
emitReductionCombiner(CGF, ReductionOp);
}
void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
ArrayRef<const Expr *> Privates,
ArrayRef<const Expr *> LHSExprs,
ArrayRef<const Expr *> RHSExprs,
ArrayRef<const Expr *> ReductionOps,
bool WithNowait, bool SimpleReduction) {
if (!CGF.HaveInsertPoint())
return;
// The following code should be emitted for the reduction:
//
// static kmp_critical_name lock = { 0 };
//
// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
// *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
// ...
// *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
// *(Type<n>-1*)rhs[<n>-1]);
// }
//
// ...
// void *RedList[<n>] = {&<RHSExprs>[0], ..., &<RHSExprs>[<n>-1]};
// switch (__kmpc_reduce{_nowait}(<loc>, <gtid>, <n>, sizeof(RedList),
// RedList, reduce_func, &<lock>)) {
// case 1:
// ...
// <LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]);
// ...
// __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &<lock>);
// break;
// case 2:
// ...
// Atomic(<LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]));
// ...
// [__kmpc_end_reduce(<loc>, <gtid>, &<lock>);]
// break;
// default:;
// }
//
// If SimpleReduction is true, only the following code is generated:
// ...
// <LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]);
// ...
auto &C = CGM.getContext();
if (SimpleReduction) {
CodeGenFunction::RunCleanupsScope Scope(CGF);
auto IPriv = Privates.begin();
auto ILHS = LHSExprs.begin();
auto IRHS = RHSExprs.begin();
for (auto *E : ReductionOps) {
emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
cast<DeclRefExpr>(*IRHS));
++IPriv;
++ILHS;
++IRHS;
}
return;
}
// 1. Build a list of reduction variables.
// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
auto Size = RHSExprs.size();
for (auto *E : Privates) {
if (E->getType()->isVariablyModifiedType())
// Reserve space for the array size.
++Size;
}
llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
QualType ReductionArrayTy =
C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
/*IndexTypeQuals=*/0);
Address ReductionList =
CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
auto IPriv = Privates.begin();
unsigned Idx = 0;
for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
Address Elem =
CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, CGF.getPointerSize());
CGF.Builder.CreateStore(
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
Elem);
if ((*IPriv)->getType()->isVariablyModifiedType()) {
// Store array size.
++Idx;
Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
CGF.getPointerSize());
llvm::Value *Size = CGF.Builder.CreateIntCast(
CGF.getVLASize(
CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
.first,
CGF.SizeTy, /*isSigned=*/false);
CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
Elem);
}
}
// 2. Emit reduce_func().
auto *ReductionFn = emitReductionFunction(
CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
LHSExprs, RHSExprs, ReductionOps);
// 3. Create static kmp_critical_name lock = { 0 };
auto *Lock = getCriticalRegionLock(".reduction");
// 4. Build res = __kmpc_reduce{_nowait}(<loc>, <gtid>, <n>, sizeof(RedList),
// RedList, reduce_func, &<lock>);
auto *IdentTLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
auto *ThreadId = getThreadID(CGF, Loc);
auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
ReductionList.getPointer(), CGF.VoidPtrTy);
llvm::Value *Args[] = {
IdentTLoc, // ident_t *<loc>
ThreadId, // i32 <gtid>
CGF.Builder.getInt32(RHSExprs.size()), // i32 <n>
ReductionArrayTySize, // size_type sizeof(RedList)
RL, // void *RedList
ReductionFn, // void (*) (void *, void *) <reduce_func>
Lock // kmp_critical_name *&<lock>
};
auto Res = CGF.EmitRuntimeCall(
createRuntimeFunction(WithNowait ? OMPRTL__kmpc_reduce_nowait
: OMPRTL__kmpc_reduce),
Args);
// 5. Build switch(res)
auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/2);
// 6. Build case 1:
// ...
// <LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]);
// ...
// __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &<lock>);
// break;
auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
CGF.EmitBlock(Case1BB);
// Add emission of __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &<lock>);
llvm::Value *EndArgs[] = {
IdentTLoc, // ident_t *<loc>
ThreadId, // i32 <gtid>
Lock // kmp_critical_name *&<lock>
};
auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps](
CodeGenFunction &CGF, PrePostActionTy &Action) {
auto IPriv = Privates.begin();
auto ILHS = LHSExprs.begin();
auto IRHS = RHSExprs.begin();
for (auto *E : ReductionOps) {
emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
cast<DeclRefExpr>(*IRHS));
++IPriv;
++ILHS;
++IRHS;
}
};
RegionCodeGenTy RCG(CodeGen);
CommonActionTy Action(
nullptr, llvm::None,
createRuntimeFunction(WithNowait ? OMPRTL__kmpc_end_reduce_nowait
: OMPRTL__kmpc_end_reduce),
EndArgs);
RCG.setAction(Action);
RCG(CGF);
CGF.EmitBranch(DefaultBB);
// 7. Build case 2:
// ...
// Atomic(<LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]));
// ...
// break;
auto *Case2BB = CGF.createBasicBlock(".omp.reduction.case2");
SwInst->addCase(CGF.Builder.getInt32(2), Case2BB);
CGF.EmitBlock(Case2BB);
auto &&AtomicCodeGen = [Loc, &Privates, &LHSExprs, &RHSExprs, &ReductionOps](
CodeGenFunction &CGF, PrePostActionTy &Action) {
auto ILHS = LHSExprs.begin();
auto IRHS = RHSExprs.begin();
auto IPriv = Privates.begin();
for (auto *E : ReductionOps) {
const Expr *XExpr = nullptr;
const Expr *EExpr = nullptr;
const Expr *UpExpr = nullptr;
BinaryOperatorKind BO = BO_Comma;
if (auto *BO = dyn_cast<BinaryOperator>(E)) {
if (BO->getOpcode() == BO_Assign) {
XExpr = BO->getLHS();
UpExpr = BO->getRHS();
}
}
// Try to emit update expression as a simple atomic.
auto *RHSExpr = UpExpr;
if (RHSExpr) {
// Analyze RHS part of the whole expression.
if (auto *ACO = dyn_cast<AbstractConditionalOperator>(
RHSExpr->IgnoreParenImpCasts())) {
// If this is a conditional operator, analyze its condition for
// min/max reduction operator.
RHSExpr = ACO->getCond();
}
if (auto *BORHS =
dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
EExpr = BORHS->getRHS();
BO = BORHS->getOpcode();
}
}
if (XExpr) {
auto *VD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
auto &&AtomicRedGen = [BO, VD, IPriv,
Loc](CodeGenFunction &CGF, const Expr *XExpr,
const Expr *EExpr, const Expr *UpExpr) {
LValue X = CGF.EmitLValue(XExpr);
RValue E;
if (EExpr)
E = CGF.EmitAnyExpr(EExpr);
CGF.EmitOMPAtomicSimpleUpdateExpr(
X, E, BO, /*IsXLHSInRHSPart=*/true,
llvm::AtomicOrdering::Monotonic, Loc,
[&CGF, UpExpr, VD, IPriv, Loc](RValue XRValue) {
CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
PrivateScope.addPrivate(
VD, [&CGF, VD, XRValue, Loc]() -> Address {
Address LHSTemp = CGF.CreateMemTemp(VD->getType());
CGF.emitOMPSimpleStore(
CGF.MakeAddrLValue(LHSTemp, VD->getType()), XRValue,
VD->getType().getNonReferenceType(), Loc);
return LHSTemp;
});
(void)PrivateScope.Privatize();
return CGF.EmitAnyExpr(UpExpr);
});
};
if ((*IPriv)->getType()->isArrayType()) {
// Emit atomic reduction for array section.
auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), VD, RHSVar,
AtomicRedGen, XExpr, EExpr, UpExpr);
} else
// Emit atomic reduction for array subscript or single variable.
AtomicRedGen(CGF, XExpr, EExpr, UpExpr);
} else {
// Emit as a critical region.
auto &&CritRedGen = [E, Loc](CodeGenFunction &CGF, const Expr *,
const Expr *, const Expr *) {
auto &RT = CGF.CGM.getOpenMPRuntime();
RT.emitCriticalRegion(
CGF, ".atomic_reduction",
[=](CodeGenFunction &CGF, PrePostActionTy &Action) {
Action.Enter(CGF);
emitReductionCombiner(CGF, E);
},
Loc);
};
if ((*IPriv)->getType()->isArrayType()) {
auto *LHSVar = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
auto *RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), LHSVar, RHSVar,
CritRedGen);
} else
CritRedGen(CGF, nullptr, nullptr, nullptr);
}
++ILHS;
++IRHS;
++IPriv;
}
};
RegionCodeGenTy AtomicRCG(AtomicCodeGen);
if (!WithNowait) {
// Add emission of __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
llvm::Value *EndArgs[] = {
IdentTLoc, // ident_t *<loc>
ThreadId, // i32 <gtid>
Lock // kmp_critical_name *&<lock>
};
CommonActionTy Action(nullptr, llvm::None,
createRuntimeFunction(OMPRTL__kmpc_end_reduce),
EndArgs);
AtomicRCG.setAction(Action);
AtomicRCG(CGF);
} else
AtomicRCG(CGF);
CGF.EmitBranch(DefaultBB);
CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
}
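// A rough sketch of what the code above emits for a hypothetical
// `reduction(+: sum)` clause with private copy sum_priv:
//   void *RedList[1] = { &sum_priv };
//   switch (__kmpc_reduce{_nowait}(<loc>, <gtid>, 1, sizeof(RedList), RedList,
//                                  .omp.reduction.reduction_func, &lock)) {
//   case 1:
//     sum = sum + sum_priv;
//     __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &lock);
//     break;
//   case 2:
//     Atomic(sum = sum + sum_priv);
//     [__kmpc_end_reduce(<loc>, <gtid>, &lock);] // only without 'nowait'
//     break;
//   default:;
//   }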
void CGOpenMPRuntime::emitTaskwaitCall(CodeGenFunction &CGF,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
// Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
// global_tid);
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
// Ignore return result until untied tasks are supported.
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_taskwait), Args);
if (auto *Region = dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
Region->emitUntiedSwitch(CGF);
}
void CGOpenMPRuntime::emitInlinedDirective(CodeGenFunction &CGF,
OpenMPDirectiveKind InnerKind,
const RegionCodeGenTy &CodeGen,
bool HasCancel) {
if (!CGF.HaveInsertPoint())
return;
InlinedOpenMPRegionRAII Region(CGF, CodeGen, InnerKind, HasCancel);
CGF.CapturedStmtInfo->EmitBody(CGF, /*S=*/nullptr);
}
namespace {
enum RTCancelKind {
CancelNoreq = 0,
CancelParallel = 1,
CancelLoop = 2,
CancelSections = 3,
CancelTaskgroup = 4
};
} // anonymous namespace
static RTCancelKind getCancellationKind(OpenMPDirectiveKind CancelRegion) {
RTCancelKind CancelKind = CancelNoreq;
if (CancelRegion == OMPD_parallel)
CancelKind = CancelParallel;
else if (CancelRegion == OMPD_for)
CancelKind = CancelLoop;
else if (CancelRegion == OMPD_sections)
CancelKind = CancelSections;
else {
assert(CancelRegion == OMPD_taskgroup);
CancelKind = CancelTaskgroup;
}
return CancelKind;
}
void CGOpenMPRuntime::emitCancellationPointCall(
CodeGenFunction &CGF, SourceLocation Loc,
OpenMPDirectiveKind CancelRegion) {
if (!CGF.HaveInsertPoint())
return;
// Build call kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32
// global_tid, kmp_int32 cncl_kind);
if (auto *OMPRegionInfo =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
- if (OMPRegionInfo->hasCancel()) {
+ // For 'cancellation point taskgroup', the task region info may not have a
+ // cancel. This may instead happen in another adjacent task.
+ if (CancelRegion == OMPD_taskgroup || OMPRegionInfo->hasCancel()) {
llvm::Value *Args[] = {
emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
CGF.Builder.getInt32(getCancellationKind(CancelRegion))};
// Ignore return result until untied tasks are supported.
auto *Result = CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_cancellationpoint), Args);
// if (__kmpc_cancellationpoint()) {
// exit from construct;
// }
auto *ExitBB = CGF.createBasicBlock(".cancel.exit");
auto *ContBB = CGF.createBasicBlock(".cancel.continue");
auto *Cmp = CGF.Builder.CreateIsNotNull(Result);
CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB);
CGF.EmitBlock(ExitBB);
// exit from construct;
auto CancelDest =
CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind());
CGF.EmitBranchThroughCleanup(CancelDest);
CGF.EmitBlock(ContBB, /*IsFinished=*/true);
}
}
}
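// A rough sketch for a hypothetical `#pragma omp cancellation point for`
// placed in a worksharing region that has a matching 'cancel':
//   if (__kmpc_cancellationpoint(<loc>, <gtid>, /*cncl_kind=*/CancelLoop))
//     branch through cleanups to the cancel destination (".cancel.exit");
//   // otherwise fall through to ".cancel.continue".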
void CGOpenMPRuntime::emitCancelCall(CodeGenFunction &CGF, SourceLocation Loc,
const Expr *IfCond,
OpenMPDirectiveKind CancelRegion) {
if (!CGF.HaveInsertPoint())
return;
// Build call kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
// kmp_int32 cncl_kind);
if (auto *OMPRegionInfo =
dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
auto &&ThenGen = [Loc, CancelRegion, OMPRegionInfo](CodeGenFunction &CGF,
PrePostActionTy &) {
auto &RT = CGF.CGM.getOpenMPRuntime();
llvm::Value *Args[] = {
RT.emitUpdateLocation(CGF, Loc), RT.getThreadID(CGF, Loc),
CGF.Builder.getInt32(getCancellationKind(CancelRegion))};
// Ignore return result until untied tasks are supported.
auto *Result = CGF.EmitRuntimeCall(
RT.createRuntimeFunction(OMPRTL__kmpc_cancel), Args);
// if (__kmpc_cancel()) {
// exit from construct;
// }
auto *ExitBB = CGF.createBasicBlock(".cancel.exit");
auto *ContBB = CGF.createBasicBlock(".cancel.continue");
auto *Cmp = CGF.Builder.CreateIsNotNull(Result);
CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB);
CGF.EmitBlock(ExitBB);
// exit from construct;
auto CancelDest =
CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind());
CGF.EmitBranchThroughCleanup(CancelDest);
CGF.EmitBlock(ContBB, /*IsFinished=*/true);
};
if (IfCond)
emitOMPIfClause(CGF, IfCond, ThenGen,
[](CodeGenFunction &, PrePostActionTy &) {});
else {
RegionCodeGenTy ThenRCG(ThenGen);
ThenRCG(CGF);
}
}
}
/// \brief Obtain information that uniquely identifies a target entry. This
/// consists of the file and device IDs as well as line number associated with
/// the relevant entry source location.
static void getTargetEntryUniqueInfo(ASTContext &C, SourceLocation Loc,
unsigned &DeviceID, unsigned &FileID,
unsigned &LineNum) {
auto &SM = C.getSourceManager();
// The loc should always be valid and have a file ID (the user cannot use
// #pragma directives in macros).
assert(Loc.isValid() && "Source location is expected to be always valid.");
assert(Loc.isFileID() && "Source location is expected to refer to a file.");
PresumedLoc PLoc = SM.getPresumedLoc(Loc);
assert(PLoc.isValid() && "Source location is expected to be always valid.");
llvm::sys::fs::UniqueID ID;
if (llvm::sys::fs::getUniqueID(PLoc.getFilename(), ID))
llvm_unreachable("Source file with target region no longer exists!");
DeviceID = ID.getDevice();
FileID = ID.getFile();
LineNum = PLoc.getLine();
}
void CGOpenMPRuntime::emitTargetOutlinedFunction(
const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
assert(!ParentName.empty() && "Invalid target region parent name!");
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
IsOffloadEntry, CodeGen);
}
void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
// Create a unique name for the entry function using the source location
// information of the current target region. The name will be something like:
//
// __omp_offloading_DD_FFFF_PP_lBB
//
// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
// mangled name of the function that encloses the target region and BB is the
// line number of the target region.
unsigned DeviceID;
unsigned FileID;
unsigned Line;
getTargetEntryUniqueInfo(CGM.getContext(), D.getLocStart(), DeviceID, FileID,
Line);
SmallString<64> EntryFnName;
{
llvm::raw_svector_ostream OS(EntryFnName);
OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
<< llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
}
const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
CodeGenFunction CGF(CGM, true);
CGOpenMPTargetRegionInfo CGInfo(CS, CodeGen, EntryFnName);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
OutlinedFn = CGF.GenerateOpenMPCapturedStmtFunction(CS);
// If this target outline function is not an offload entry, we don't need to
// register it.
if (!IsOffloadEntry)
return;
// The target region ID is used by the runtime library to identify the current
// target region, so it only has to be unique and not necessarily point to
// anything. It could be the pointer to the outlined function that implements
// the target region, but we aren't using that so that the compiler doesn't
// need to keep that, and could therefore inline the host function if proven
// worthwhile during optimization. On the other hand, if emitting code for the
// device, the ID has to be the function address so that it can be retrieved from
// the offloading entry and launched by the runtime library. We also mark the
// outlined function to have external linkage in case we are emitting code for
// the device, because these functions will be entry points to the device.
if (CGM.getLangOpts().OpenMPIsDevice) {
OutlinedFnID = llvm::ConstantExpr::getBitCast(OutlinedFn, CGM.Int8PtrTy);
OutlinedFn->setLinkage(llvm::GlobalValue::ExternalLinkage);
} else
OutlinedFnID = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
llvm::GlobalValue::PrivateLinkage,
llvm::Constant::getNullValue(CGM.Int8Ty), ".omp_offload.region_id");
// Register the information for the entry associated with this target region.
OffloadEntriesInfoManager.registerTargetRegionEntryInfo(
DeviceID, FileID, ParentName, Line, OutlinedFn, OutlinedFnID,
/*Flags=*/0);
}
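// For illustration, with hypothetical device ID 0x1a, file ID 0x2b, parent
// function "foo" and a target region on line 42, the stream above yields the
// entry name "__omp_offloading_1a_2b_foo_l42".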
/// Discard all CompoundStmts intervening between two constructs.
static const Stmt *ignoreCompoundStmts(const Stmt *Body) {
while (auto *CS = dyn_cast_or_null<CompoundStmt>(Body))
Body = CS->body_front();
return Body;
}
/// \brief Emit the num_teams clause of an enclosed teams directive at the
/// target region scope. If there is no teams directive associated with the
/// target directive, or if there is no num_teams clause associated with the
/// enclosed teams directive, return nullptr.
static llvm::Value *
emitNumTeamsClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime,
CodeGenFunction &CGF,
const OMPExecutableDirective &D) {
assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the "
"teams directive expected to be "
"emitted only for the host!");
// FIXME: For the moment we do not support combined directives with target and
// teams, so we do not expect to get any num_teams clause in the provided
// directive. Once we support that, this assertion can be replaced by the
// actual emission of the clause expression.
assert(D.getSingleClause<OMPNumTeamsClause>() == nullptr &&
"Not expecting clause in directive.");
// If the current target region has a teams region enclosed, we need to get
// the number of teams to pass to the runtime function call. This is done
// by generating the expression in an inlined region. This is required because
// the expression is captured in the enclosing target environment when the
// teams directive is not combined with target.
const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
// FIXME: Accommodate other combined directives with teams when they become
// available.
if (auto *TeamsDir = dyn_cast_or_null<OMPTeamsDirective>(
ignoreCompoundStmts(CS.getCapturedStmt()))) {
if (auto *NTE = TeamsDir->getSingleClause<OMPNumTeamsClause>()) {
CGOpenMPInnerExprInfo CGInfo(CGF, CS);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
llvm::Value *NumTeams = CGF.EmitScalarExpr(NTE->getNumTeams());
return CGF.Builder.CreateIntCast(NumTeams, CGF.Int32Ty,
/*IsSigned=*/true);
}
// If we have an enclosed teams directive but no num_teams clause we use
// the default value 0.
return CGF.Builder.getInt32(0);
}
// No teams associated with the directive.
return nullptr;
}
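// For illustration, a hypothetical
//   #pragma omp target
//   #pragma omp teams num_teams(8)
// makes this function return the value 8 cast to i32; a teams directive
// without num_teams returns the constant 0, and no enclosed teams directive
// returns nullptr. emitThreadLimitClauseForTargetDirective below follows the
// same pattern for thread_limit.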
/// \brief Emit the thread_limit clause of an enclosed teams directive at the
/// target region scope. If there is no teams directive associated with the
/// target directive, or if there is no thread_limit clause associated with the
/// enclosed teams directive, return nullptr.
static llvm::Value *
emitThreadLimitClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime,
CodeGenFunction &CGF,
const OMPExecutableDirective &D) {
assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the "
"teams directive expected to be "
"emitted only for the host!");
// FIXME: For the moment we do not support combined directives with target and
// teams, so we do not expect to get any thread_limit clause in the provided
// directive. Once we support that, this assertion can be replaced by the
// actual emission of the clause expression.
assert(D.getSingleClause<OMPThreadLimitClause>() == nullptr &&
"Not expecting clause in directive.");
// If the current target region has a teams region enclosed, we need to get
// the thread limit to pass to the runtime function call. This is done
// by generating the expression in an inlined region. This is required because
// the expression is captured in the enclosing target environment when the
// teams directive is not combined with target.
const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
// FIXME: Accommodate other combined directives with teams when they become
// available.
if (auto *TeamsDir = dyn_cast_or_null<OMPTeamsDirective>(
ignoreCompoundStmts(CS.getCapturedStmt()))) {
if (auto *TLE = TeamsDir->getSingleClause<OMPThreadLimitClause>()) {
CGOpenMPInnerExprInfo CGInfo(CGF, CS);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
llvm::Value *ThreadLimit = CGF.EmitScalarExpr(TLE->getThreadLimit());
return CGF.Builder.CreateIntCast(ThreadLimit, CGF.Int32Ty,
/*IsSigned=*/true);
}
// If we have an enclosed teams directive but no thread_limit clause we use
// the default value 0.
return CGF.Builder.getInt32(0);
}
// No teams associated with the directive.
return nullptr;
}
namespace {
// \brief Utility to handle information from clauses associated with a given
// construct that use mappable expressions (e.g. 'map' clause, 'to' clause).
// It provides a convenient interface to obtain the information and generate
// code for that information.
class MappableExprsHandler {
public:
/// \brief Values for bit flags used to specify the mapping type for
/// offloading.
enum OpenMPOffloadMappingFlags {
/// \brief Allocate memory on the device and move data from host to device.
OMP_MAP_TO = 0x01,
/// \brief Allocate memory on the device and move data from device to host.
OMP_MAP_FROM = 0x02,
/// \brief Always perform the requested mapping action on the element, even
/// if it was already mapped before.
OMP_MAP_ALWAYS = 0x04,
/// \brief Delete the element from the device environment, ignoring the
/// current reference count associated with the element.
OMP_MAP_DELETE = 0x08,
/// \brief The element being mapped is a pointer, therefore the pointee
/// should be mapped as well.
OMP_MAP_IS_PTR = 0x10,
/// \brief This flag signals that an argument is the first one relating to
/// a map/private clause expression. In some cases a single
/// map/privatization results in multiple arguments passed to the runtime
/// library.
OMP_MAP_FIRST_REF = 0x20,
/// \brief Signal that the runtime library has to return the device pointer
/// in the current position for the data being mapped.
OMP_MAP_RETURN_PTR = 0x40,
/// \brief This flag signals that the reference being passed is a pointer to
/// private data.
OMP_MAP_PRIVATE_PTR = 0x80,
/// \brief Pass the element to the device by value.
OMP_MAP_PRIVATE_VAL = 0x100,
};
/// Class that associates information with a base pointer to be passed to the
/// runtime library.
class BasePointerInfo {
/// The base pointer.
llvm::Value *Ptr = nullptr;
/// The base declaration that refers to this device pointer, or null if
/// there is none.
const ValueDecl *DevPtrDecl = nullptr;
public:
BasePointerInfo(llvm::Value *Ptr, const ValueDecl *DevPtrDecl = nullptr)
: Ptr(Ptr), DevPtrDecl(DevPtrDecl) {}
llvm::Value *operator*() const { return Ptr; }
const ValueDecl *getDevicePtrDecl() const { return DevPtrDecl; }
void setDevicePtrDecl(const ValueDecl *D) { DevPtrDecl = D; }
};
typedef SmallVector<BasePointerInfo, 16> MapBaseValuesArrayTy;
typedef SmallVector<llvm::Value *, 16> MapValuesArrayTy;
typedef SmallVector<unsigned, 16> MapFlagsArrayTy;
private:
/// \brief Directive from where the map clauses were extracted.
const OMPExecutableDirective &CurDir;
/// \brief Function the directive is being generated for.
CodeGenFunction &CGF;
/// \brief Set of all first private variables in the current directive.
llvm::SmallPtrSet<const VarDecl *, 8> FirstPrivateDecls;
/// Map between device pointer declarations and their expression components.
/// The key value for declarations in 'this' is null.
llvm::DenseMap<
const ValueDecl *,
SmallVector<OMPClauseMappableExprCommon::MappableExprComponentListRef, 4>>
DevPointersMap;
llvm::Value *getExprTypeSize(const Expr *E) const {
auto ExprTy = E->getType().getCanonicalType();
// Reference types are ignored for mapping purposes.
if (auto *RefTy = ExprTy->getAs<ReferenceType>())
ExprTy = RefTy->getPointeeType().getCanonicalType();
// Given that an array section is considered a built-in type, we need to
// do the calculation based on the length of the section instead of relying
// on CGF.getTypeSize(E->getType()).
if (const auto *OAE = dyn_cast<OMPArraySectionExpr>(E)) {
QualType BaseTy = OMPArraySectionExpr::getBaseOriginalType(
OAE->getBase()->IgnoreParenImpCasts())
.getCanonicalType();
// If there is no length associated with the expression, that means we
// are using the whole length of the base.
if (!OAE->getLength() && OAE->getColonLoc().isValid())
return CGF.getTypeSize(BaseTy);
llvm::Value *ElemSize;
if (auto *PTy = BaseTy->getAs<PointerType>())
ElemSize = CGF.getTypeSize(PTy->getPointeeType().getCanonicalType());
else {
auto *ATy = cast<ArrayType>(BaseTy.getTypePtr());
assert(ATy && "Expecting array type if not a pointer type.");
ElemSize = CGF.getTypeSize(ATy->getElementType().getCanonicalType());
}
// If we don't have a length at this point, that is because we have an
// array section with a single element.
if (!OAE->getLength())
return ElemSize;
auto *LengthVal = CGF.EmitScalarExpr(OAE->getLength());
LengthVal =
CGF.Builder.CreateIntCast(LengthVal, CGF.SizeTy, /*isSigned=*/false);
return CGF.Builder.CreateNUWMul(LengthVal, ElemSize);
}
return CGF.getTypeSize(ExprTy);
}
/// \brief Return the corresponding bits for a given map clause modifier. Add
/// a flag marking the map as a pointer if requested. Add a flag marking the
/// map as the first one of a series of maps that relate to the same map
/// expression.
unsigned getMapTypeBits(OpenMPMapClauseKind MapType,
OpenMPMapClauseKind MapTypeModifier, bool AddPtrFlag,
bool AddIsFirstFlag) const {
unsigned Bits = 0u;
switch (MapType) {
case OMPC_MAP_alloc:
case OMPC_MAP_release:
// alloc and release are the default behavior in the runtime library, i.e.
// if we don't pass any bits, alloc/release is what the runtime is going to
// do. Therefore, we don't need to signal anything for these two type
// modifiers.
break;
case OMPC_MAP_to:
Bits = OMP_MAP_TO;
break;
case OMPC_MAP_from:
Bits = OMP_MAP_FROM;
break;
case OMPC_MAP_tofrom:
Bits = OMP_MAP_TO | OMP_MAP_FROM;
break;
case OMPC_MAP_delete:
Bits = OMP_MAP_DELETE;
break;
default:
llvm_unreachable("Unexpected map type!");
break;
}
if (AddPtrFlag)
Bits |= OMP_MAP_IS_PTR;
if (AddIsFirstFlag)
Bits |= OMP_MAP_FIRST_REF;
if (MapTypeModifier == OMPC_MAP_always)
Bits |= OMP_MAP_ALWAYS;
return Bits;
}
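// For illustration, a 'tofrom' map with the 'always' modifier that is also
// the first entry for its capture yields
// OMP_MAP_TO | OMP_MAP_FROM | OMP_MAP_ALWAYS | OMP_MAP_FIRST_REF == 0x27.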
/// \brief Return true if the provided expression is a final array section. A
/// final array section is one whose length can't be proved to be one.
bool isFinalArraySectionExpression(const Expr *E) const {
auto *OASE = dyn_cast<OMPArraySectionExpr>(E);
// It is not an array section and therefore not a unity-size one.
if (!OASE)
return false;
// An array section with no colon always refers to a single element.
if (OASE->getColonLoc().isInvalid())
return false;
auto *Length = OASE->getLength();
// If we don't have a length we have to check if the array has size 1
// for this dimension. Also, we should always expect a length if the
// base type is a pointer.
if (!Length) {
auto BaseQTy = OMPArraySectionExpr::getBaseOriginalType(
OASE->getBase()->IgnoreParenImpCasts())
.getCanonicalType();
if (auto *ATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr()))
return ATy->getSize().getSExtValue() != 1;
// If we don't have a constant dimension length, we have to consider
// the current section as having any size, so it is not necessarily
// unitary. If it happens to be unity size, that's the user's fault.
return true;
}
// Check if the length evaluates to 1.
llvm::APSInt ConstLength;
if (!Length->EvaluateAsInt(ConstLength, CGF.getContext()))
return true; // Can have more than size 1.
return ConstLength.getSExtValue() != 1;
}
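// For illustration, given `int a[10]`: a[2:1] is not final (its length folds
// to the constant 1), a[2:3] and a[2:n] with a runtime 'n' are final, and
// a[2:] is treated as final because the checked dimension size is 10.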
/// \brief Generate the base pointers, section pointers, sizes and map type
/// bits for the provided map type, map modifier, and expression components.
/// \a IsFirstComponent should be set to true if the provided set of
/// components is the first associated with a capture.
void generateInfoForComponentList(
OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapTypeModifier,
OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
MapBaseValuesArrayTy &BasePointers, MapValuesArrayTy &Pointers,
MapValuesArrayTy &Sizes, MapFlagsArrayTy &Types,
bool IsFirstComponentList) const {
// The following summarizes what has to be generated for each map and the
// types below. The generated information is expressed in this order:
// base pointer, section pointer, size, flags
// (to add to the ones that come from the map type and modifier).
//
// double d;
// int i[100];
// float *p;
//
// struct S1 {
// int i;
// float f[50];
// }
// struct S2 {
// int i;
// float f[50];
// S1 s;
// double *p;
// struct S2 *ps;
// }
// S2 s;
// S2 *ps;
//
// map(d)
// &d, &d, sizeof(double), noflags
//
// map(i)
// &i, &i, 100*sizeof(int), noflags
//
// map(i[1:23])
// &i(=&i[0]), &i[1], 23*sizeof(int), noflags
//
// map(p)
// &p, &p, sizeof(float*), noflags
//
// map(p[1:24])
// p, &p[1], 24*sizeof(float), noflags
//
// map(s)
// &s, &s, sizeof(S2), noflags
//
// map(s.i)
// &s, &(s.i), sizeof(int), noflags
//
// map(s.s.f)
// &s, &(s.s.f[0]), 50*sizeof(float), noflags
//
// map(s.p)
// &s, &(s.p), sizeof(double*), noflags
//
// map(s.p[:22], s.a, s.b)
// &s, &(s.p), sizeof(double*), noflags
// &(s.p), &(s.p[0]), 22*sizeof(double), ptr_flag + extra_flag
//
// map(s.ps)
// &s, &(s.ps), sizeof(S2*), noflags
//
// map(s.ps->s.i)
// &s, &(s.ps), sizeof(S2*), noflags
// &(s.ps), &(s.ps->s.i), sizeof(int), ptr_flag + extra_flag
//
// map(s.ps->ps)
// &s, &(s.ps), sizeof(S2*), noflags
// &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
//
// map(s.ps->ps->ps)
// &s, &(s.ps), sizeof(S2*), noflags
// &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
// &(s.ps->ps), &(s.ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
//
// map(s.ps->ps->s.f[:22])
// &s, &(s.ps), sizeof(S2*), noflags
// &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
// &(s.ps->ps), &(s.ps->ps->s.f[0]), 22*sizeof(float), ptr_flag + extra_flag
//
// map(ps)
// &ps, &ps, sizeof(S2*), noflags
//
// map(ps->i)
// ps, &(ps->i), sizeof(int), noflags
//
// map(ps->s.f)
// ps, &(ps->s.f[0]), 50*sizeof(float), noflags
//
// map(ps->p)
// ps, &(ps->p), sizeof(double*), noflags
//
// map(ps->p[:22])
// ps, &(ps->p), sizeof(double*), noflags
// &(ps->p), &(ps->p[0]), 22*sizeof(double), ptr_flag + extra_flag
//
// map(ps->ps)
// ps, &(ps->ps), sizeof(S2*), noflags
//
// map(ps->ps->s.i)
// ps, &(ps->ps), sizeof(S2*), noflags
// &(ps->ps), &(ps->ps->s.i), sizeof(int), ptr_flag + extra_flag
//
// map(ps->ps->ps)
// ps, &(ps->ps), sizeof(S2*), noflags
// &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
//
// map(ps->ps->ps->ps)
// ps, &(ps->ps), sizeof(S2*), noflags
// &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
// &(ps->ps->ps), &(ps->ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
//
// map(ps->ps->ps->s.f[:22])
// ps, &(ps->ps), sizeof(S2*), noflags
// &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
// &(ps->ps->ps), &(ps->ps->ps->s.f[0]), 22*sizeof(float), ptr_flag +
// extra_flag
// Track if the map information being generated is the first for a capture.
bool IsCaptureFirstInfo = IsFirstComponentList;
// Scan the components from the base to the complete expression.
auto CI = Components.rbegin();
auto CE = Components.rend();
auto I = CI;
// Track if the map information being generated is the first for a list of
// components.
bool IsExpressionFirstInfo = true;
llvm::Value *BP = nullptr;
if (auto *ME = dyn_cast<MemberExpr>(I->getAssociatedExpression())) {
// The base is the 'this' pointer. The content of the pointer is going
// to be the base of the field being mapped.
BP = CGF.EmitScalarExpr(ME->getBase());
} else {
// The base is the reference to the variable.
// BP = &Var.
BP = CGF.EmitLValue(cast<DeclRefExpr>(I->getAssociatedExpression()))
.getPointer();
// If the variable is a pointer and is being dereferenced (i.e. is not
// the last component), the base has to be the pointer itself, not its
// reference. References are ignored for mapping purposes.
QualType Ty =
I->getAssociatedDeclaration()->getType().getNonReferenceType();
if (Ty->isAnyPointerType() && std::next(I) != CE) {
auto PtrAddr = CGF.MakeNaturalAlignAddrLValue(BP, Ty);
BP = CGF.EmitLoadOfPointerLValue(PtrAddr.getAddress(),
Ty->castAs<PointerType>())
.getPointer();
// We do not need to generate individual map information for the
// pointer, it can be associated with the combined storage.
++I;
}
}
for (; I != CE; ++I) {
auto Next = std::next(I);
// We need to generate the addresses and sizes if this is the last
// component, if the component is a pointer or if it is an array section
// whose length can't be proved to be one. If this is a pointer, it
// becomes the base address for the following components.
// A final array section is one whose length can't be proved to be one.
bool IsFinalArraySection =
isFinalArraySectionExpression(I->getAssociatedExpression());
// Get information on whether the element is a pointer. Have to do a
// special treatment for array sections given that they are built-in
// types.
const auto *OASE =
dyn_cast<OMPArraySectionExpr>(I->getAssociatedExpression());
bool IsPointer =
(OASE &&
OMPArraySectionExpr::getBaseOriginalType(OASE)
.getCanonicalType()
->isAnyPointerType()) ||
I->getAssociatedExpression()->getType()->isAnyPointerType();
if (Next == CE || IsPointer || IsFinalArraySection) {
// If this is not the last component, we expect the pointer to be
// associated with an array expression or member expression.
assert((Next == CE ||
isa<MemberExpr>(Next->getAssociatedExpression()) ||
isa<ArraySubscriptExpr>(Next->getAssociatedExpression()) ||
isa<OMPArraySectionExpr>(Next->getAssociatedExpression())) &&
"Unexpected expression");
auto *LB = CGF.EmitLValue(I->getAssociatedExpression()).getPointer();
auto *Size = getExprTypeSize(I->getAssociatedExpression());
// If we have a member expression and the current component is a
// reference, we have to map the reference too. Whenever we have a
// reference, the section that the reference refers to is going to be a
// load instruction from the storage assigned to the reference.
if (isa<MemberExpr>(I->getAssociatedExpression()) &&
I->getAssociatedDeclaration()->getType()->isReferenceType()) {
auto *LI = cast<llvm::LoadInst>(LB);
auto *RefAddr = LI->getPointerOperand();
BasePointers.push_back(BP);
Pointers.push_back(RefAddr);
Sizes.push_back(CGF.getTypeSize(CGF.getContext().VoidPtrTy));
Types.push_back(getMapTypeBits(
/*MapType*/ OMPC_MAP_alloc, /*MapTypeModifier=*/OMPC_MAP_unknown,
!IsExpressionFirstInfo, IsCaptureFirstInfo));
IsExpressionFirstInfo = false;
IsCaptureFirstInfo = false;
// The reference will be the next base address.
BP = RefAddr;
}
BasePointers.push_back(BP);
Pointers.push_back(LB);
Sizes.push_back(Size);
// We need to add a pointer flag for each map that comes from the
// same expression except for the first one. We also need to signal
// this map is the first one that relates with the current capture
// (there is a set of entries for each capture).
Types.push_back(getMapTypeBits(MapType, MapTypeModifier,
!IsExpressionFirstInfo,
IsCaptureFirstInfo));
// If we have a final array section, we are done with this expression.
if (IsFinalArraySection)
break;
// The pointer becomes the base for the next element.
if (Next != CE)
BP = LB;
IsExpressionFirstInfo = false;
IsCaptureFirstInfo = false;
continue;
}
}
}
/// \brief Return the adjusted map modifiers if the declaration a capture
/// refers to appears in a first-private clause. This is expected to be used
/// only with directives that start with 'target'.
unsigned adjustMapModifiersForPrivateClauses(const CapturedStmt::Capture &Cap,
unsigned CurrentModifiers) {
assert(Cap.capturesVariable() && "Expected capture by reference only!");
// A firstprivate variable captured by reference will use only the
// 'private ptr' and 'map to' flags. Return the right flags if the captured
// declaration is known to be firstprivate in this handler.
if (FirstPrivateDecls.count(Cap.getCapturedVar()))
return MappableExprsHandler::OMP_MAP_PRIVATE_PTR |
MappableExprsHandler::OMP_MAP_TO;
// We didn't modify anything.
return CurrentModifiers;
}
public:
MappableExprsHandler(const OMPExecutableDirective &Dir, CodeGenFunction &CGF)
: CurDir(Dir), CGF(CGF) {
// Extract firstprivate clause information.
for (const auto *C : Dir.getClausesOfKind<OMPFirstprivateClause>())
for (const auto *D : C->varlists())
FirstPrivateDecls.insert(
cast<VarDecl>(cast<DeclRefExpr>(D)->getDecl())->getCanonicalDecl());
// Extract device pointer clause information.
for (const auto *C : Dir.getClausesOfKind<OMPIsDevicePtrClause>())
for (auto L : C->component_lists())
DevPointersMap[L.first].push_back(L.second);
}
/// \brief Generate all the base pointers, section pointers, sizes and map
/// types for the extracted mappable expressions. Also, for each item that
/// relates with a device pointer, a pair of the relevant declaration and
/// index where it occurs is appended to the device pointers info array.
void generateAllInfo(MapBaseValuesArrayTy &BasePointers,
MapValuesArrayTy &Pointers, MapValuesArrayTy &Sizes,
MapFlagsArrayTy &Types) const {
BasePointers.clear();
Pointers.clear();
Sizes.clear();
Types.clear();
struct MapInfo {
/// Kind that defines how a device pointer has to be returned.
enum ReturnPointerKind {
// Don't have to return any pointer.
RPK_None,
// Pointer is the base of the declaration.
RPK_Base,
// Pointer is a member of the base declaration - 'this'
RPK_Member,
// Pointer is a reference and a member of the base declaration - 'this'
RPK_MemberReference,
};
OMPClauseMappableExprCommon::MappableExprComponentListRef Components;
OpenMPMapClauseKind MapType;
OpenMPMapClauseKind MapTypeModifier;
ReturnPointerKind ReturnDevicePointer;
MapInfo()
: MapType(OMPC_MAP_unknown), MapTypeModifier(OMPC_MAP_unknown),
ReturnDevicePointer(RPK_None) {}
MapInfo(
OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapTypeModifier,
ReturnPointerKind ReturnDevicePointer)
: Components(Components), MapType(MapType),
MapTypeModifier(MapTypeModifier),
ReturnDevicePointer(ReturnDevicePointer) {}
};
// We have to process the component lists that relate with the same
// declaration in a single chunk so that we can generate the map flags
// correctly. Therefore, we organize all lists in a map.
llvm::DenseMap<const ValueDecl *, SmallVector<MapInfo, 8>> Info;
// Helper function to fill the information map for the different supported
// clauses.
auto &&InfoGen = [&Info](
const ValueDecl *D,
OMPClauseMappableExprCommon::MappableExprComponentListRef L,
OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapModifier,
MapInfo::ReturnPointerKind ReturnDevicePointer) {
const ValueDecl *VD =
D ? cast<ValueDecl>(D->getCanonicalDecl()) : nullptr;
Info[VD].push_back({L, MapType, MapModifier, ReturnDevicePointer});
};
// FIXME: MSVC 2013 seems to require this-> to find member CurDir.
for (auto *C : this->CurDir.getClausesOfKind<OMPMapClause>())
for (auto L : C->component_lists())
InfoGen(L.first, L.second, C->getMapType(), C->getMapTypeModifier(),
MapInfo::RPK_None);
for (auto *C : this->CurDir.getClausesOfKind<OMPToClause>())
for (auto L : C->component_lists())
InfoGen(L.first, L.second, OMPC_MAP_to, OMPC_MAP_unknown,
MapInfo::RPK_None);
for (auto *C : this->CurDir.getClausesOfKind<OMPFromClause>())
for (auto L : C->component_lists())
InfoGen(L.first, L.second, OMPC_MAP_from, OMPC_MAP_unknown,
MapInfo::RPK_None);
// Look at the use_device_ptr clause information and mark the existing map
// entries as such. If there is no map information for an entry in the
// use_device_ptr list, we create one with map type 'alloc' and a zero-size
// section. It is the user's fault if it was not mapped before.
// FIXME: MSVC 2013 seems to require this-> to find member CurDir.
for (auto *C : this->CurDir.getClausesOfKind<OMPUseDevicePtrClause>())
for (auto L : C->component_lists()) {
assert(!L.second.empty() && "Not expecting empty list of components!");
const ValueDecl *VD = L.second.back().getAssociatedDeclaration();
VD = cast<ValueDecl>(VD->getCanonicalDecl());
auto *IE = L.second.back().getAssociatedExpression();
// If the first component is a member expression, we have to look into
// 'this', which maps to null in the map of map information. Otherwise
// look directly for the information.
auto It = Info.find(isa<MemberExpr>(IE) ? nullptr : VD);
// We potentially have map information for this declaration already.
// Look for the first set of components that refer to it.
if (It != Info.end()) {
auto CI = std::find_if(
It->second.begin(), It->second.end(), [VD](const MapInfo &MI) {
return MI.Components.back().getAssociatedDeclaration() == VD;
});
// If we found a map entry, signal that the pointer has to be returned
// and move on to the next declaration.
if (CI != It->second.end()) {
CI->ReturnDevicePointer = isa<MemberExpr>(IE)
? (VD->getType()->isReferenceType()
? MapInfo::RPK_MemberReference
: MapInfo::RPK_Member)
: MapInfo::RPK_Base;
continue;
}
}
// We didn't find any match in our map information - generate a zero
// size array section.
// FIXME: MSVC 2013 seems to require this-> to find member CGF.
llvm::Value *Ptr =
this->CGF
.EmitLoadOfLValue(this->CGF.EmitLValue(IE), SourceLocation())
.getScalarVal();
BasePointers.push_back({Ptr, VD});
Pointers.push_back(Ptr);
Sizes.push_back(llvm::Constant::getNullValue(this->CGF.SizeTy));
Types.push_back(OMP_MAP_RETURN_PTR | OMP_MAP_FIRST_REF);
}
for (auto &M : Info) {
// We need to know when we generate information for the first component
// associated with a capture, because the mapping flags depend on it.
bool IsFirstComponentList = true;
for (MapInfo &L : M.second) {
assert(!L.Components.empty() &&
"Not expecting declaration with no component lists.");
// Remember the current base pointer index.
unsigned CurrentBasePointersIdx = BasePointers.size();
// FIXME: MSVC 2013 seems to require this-> to find the member method.
this->generateInfoForComponentList(L.MapType, L.MapTypeModifier,
L.Components, BasePointers, Pointers,
Sizes, Types, IsFirstComponentList);
// If this entry relates with a device pointer, set the relevant
// declaration and add the 'return pointer' flag.
if (IsFirstComponentList &&
L.ReturnDevicePointer != MapInfo::RPK_None) {
// If the pointer is not the base of the map, we need to skip the
// base. If it is a reference in a member field, we also need to skip
// the map of the reference.
if (L.ReturnDevicePointer != MapInfo::RPK_Base) {
++CurrentBasePointersIdx;
if (L.ReturnDevicePointer == MapInfo::RPK_MemberReference)
++CurrentBasePointersIdx;
}
assert(BasePointers.size() > CurrentBasePointersIdx &&
"Unexpected number of mapped base pointers.");
auto *RelevantVD = L.Components.back().getAssociatedDeclaration();
assert(RelevantVD &&
"No relevant declaration related with device pointer??");
BasePointers[CurrentBasePointersIdx].setDevicePtrDecl(RelevantVD);
Types[CurrentBasePointersIdx] |= OMP_MAP_RETURN_PTR;
}
IsFirstComponentList = false;
}
}
}
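// Editor's note - an illustrative example of the use_device_ptr handling
// above (not part of this change; the variable names are hypothetical):
//
//   float *p;
//   #pragma omp target data map(to: p[:N]) use_device_ptr(p)
//   { ... }
//
// The map clause produces the usual entries for 'p'; because 'p' also appears
// in use_device_ptr, the matching MapInfo found above is tagged (RPK_Base for
// a plain declaration) so the runtime returns the translated device pointer.
// If 'p' had no map information at all, a zero-size entry carrying
// OMP_MAP_RETURN_PTR | OMP_MAP_FIRST_REF is emitted instead so the device
// address can still be obtained.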
/// \brief Generate the base pointers, section pointers, sizes and map types
/// associated to a given capture.
void generateInfoForCapture(const CapturedStmt::Capture *Cap,
llvm::Value *Arg,
MapBaseValuesArrayTy &BasePointers,
MapValuesArrayTy &Pointers,
MapValuesArrayTy &Sizes,
MapFlagsArrayTy &Types) const {
assert(!Cap->capturesVariableArrayType() &&
"Not expecting to generate map info for a variable array type!");
BasePointers.clear();
Pointers.clear();
Sizes.clear();
Types.clear();
// We need to know when we are generating information for the first component
// associated with a capture, because the mapping flags depend on it.
bool IsFirstComponentList = true;
const ValueDecl *VD =
Cap->capturesThis()
? nullptr
: cast<ValueDecl>(Cap->getCapturedVar()->getCanonicalDecl());
// If this declaration appears in an is_device_ptr clause, we just have to
// pass the pointer by value. If it is a reference to a declaration, we just
// pass its value; otherwise, if it is a member expression, we need to map
// 'to' the field.
if (!VD) {
auto It = DevPointersMap.find(VD);
if (It != DevPointersMap.end()) {
for (auto L : It->second) {
generateInfoForComponentList(
/*MapType=*/OMPC_MAP_to, /*MapTypeModifier=*/OMPC_MAP_unknown, L,
BasePointers, Pointers, Sizes, Types, IsFirstComponentList);
IsFirstComponentList = false;
}
return;
}
} else if (DevPointersMap.count(VD)) {
BasePointers.push_back({Arg, VD});
Pointers.push_back(Arg);
Sizes.push_back(CGF.getTypeSize(CGF.getContext().VoidPtrTy));
Types.push_back(OMP_MAP_PRIVATE_VAL | OMP_MAP_FIRST_REF);
return;
}
// FIXME: MSVC 2013 seems to require this-> to find member CurDir.
for (auto *C : this->CurDir.getClausesOfKind<OMPMapClause>())
for (auto L : C->decl_component_lists(VD)) {
assert(L.first == VD &&
"We got information for the wrong declaration??");
assert(!L.second.empty() &&
"Not expecting declaration with no component lists.");
generateInfoForComponentList(C->getMapType(), C->getMapTypeModifier(),
L.second, BasePointers, Pointers, Sizes,
Types, IsFirstComponentList);
IsFirstComponentList = false;
}
return;
}
/// \brief Generate the default map information for a given capture \a CI,
/// record field declaration \a RI and captured value \a CV.
void generateDefaultMapInfo(const CapturedStmt::Capture &CI,
const FieldDecl &RI, llvm::Value *CV,
MapBaseValuesArrayTy &CurBasePointers,
MapValuesArrayTy &CurPointers,
MapValuesArrayTy &CurSizes,
MapFlagsArrayTy &CurMapTypes) {
// Do the default mapping.
if (CI.capturesThis()) {
CurBasePointers.push_back(CV);
CurPointers.push_back(CV);
const PointerType *PtrTy = cast<PointerType>(RI.getType().getTypePtr());
CurSizes.push_back(CGF.getTypeSize(PtrTy->getPointeeType()));
// Default map type.
CurMapTypes.push_back(OMP_MAP_TO | OMP_MAP_FROM);
} else if (CI.capturesVariableByCopy()) {
CurBasePointers.push_back(CV);
CurPointers.push_back(CV);
if (!RI.getType()->isAnyPointerType()) {
// We have to signal to the runtime which captures are passed by value and
// are not pointers.
CurMapTypes.push_back(OMP_MAP_PRIVATE_VAL);
CurSizes.push_back(CGF.getTypeSize(RI.getType()));
} else {
// Pointers are implicitly mapped with a zero size and no flags
// (other than first map that is added for all implicit maps).
CurMapTypes.push_back(0u);
CurSizes.push_back(llvm::Constant::getNullValue(CGF.SizeTy));
}
} else {
assert(CI.capturesVariable() && "Expected captured reference.");
CurBasePointers.push_back(CV);
CurPointers.push_back(CV);
const ReferenceType *PtrTy =
cast<ReferenceType>(RI.getType().getTypePtr());
QualType ElementType = PtrTy->getPointeeType();
CurSizes.push_back(CGF.getTypeSize(ElementType));
// The default map type for a scalar/complex type is 'to' because by
// default the value doesn't have to be retrieved. For an aggregate
// type, the default is 'tofrom'.
CurMapTypes.push_back(ElementType->isAggregateType()
? (OMP_MAP_TO | OMP_MAP_FROM)
: OMP_MAP_TO);
// If we have a capture by reference we may need to add the private pointer
// flag if the base declaration appears in some firstprivate clause.
CurMapTypes.back() =
adjustMapModifiersForPrivateClauses(CI, CurMapTypes.back());
}
// Every default map produces a single argument, so it is always the
// first one.
CurMapTypes.back() |= OMP_MAP_FIRST_REF;
}
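// Editor's note - a worked summary of the defaults above (not part of this
// change). For a region such as
//
//   struct S { int x[8]; };
//   S s; int a; int *p;
//   #pragma omp target
//   { ... }
//
// the capture kind of each variable is decided earlier, in Sema; given that
// kind, the entry produced here is:
//   captured 'this'              -> size of the pointee, OMP_MAP_TO | OMP_MAP_FROM
//   by-copy, non-pointer type    -> size of the type,    OMP_MAP_PRIVATE_VAL
//   by-copy, pointer type        -> zero size,           no extra flags
//   by-reference, aggregate type -> size of the type,    OMP_MAP_TO | OMP_MAP_FROM
//   by-reference, scalar/complex -> size of the type,    OMP_MAP_TO
// Every entry also gets OMP_MAP_FIRST_REF, and a by-reference capture whose
// declaration appears in a firstprivate clause is adjusted to
// OMP_MAP_PRIVATE_PTR | OMP_MAP_TO.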
};
enum OpenMPOffloadingReservedDeviceIDs {
/// \brief Device ID used if the device was not defined; the runtime should get
/// it from the environment variables described in the spec.
OMP_DEVICEID_UNDEF = -1,
};
} // anonymous namespace
/// \brief Emit the arrays used to pass the captures and map information to the
/// offloading runtime library. If there is no map or capture information,
/// return nullptr by reference.
static void
emitOffloadingArrays(CodeGenFunction &CGF,
MappableExprsHandler::MapBaseValuesArrayTy &BasePointers,
MappableExprsHandler::MapValuesArrayTy &Pointers,
MappableExprsHandler::MapValuesArrayTy &Sizes,
MappableExprsHandler::MapFlagsArrayTy &MapTypes,
CGOpenMPRuntime::TargetDataInfo &Info) {
auto &CGM = CGF.CGM;
auto &Ctx = CGF.getContext();
// Reset the array information.
Info.clearArrayInfo();
Info.NumberOfPtrs = BasePointers.size();
if (Info.NumberOfPtrs) {
// Detect whether any capture size requires runtime evaluation; if none does,
// a constant array can be used for the sizes.
bool hasRuntimeEvaluationCaptureSize = false;
for (auto *S : Sizes)
if (!isa<llvm::Constant>(S)) {
hasRuntimeEvaluationCaptureSize = true;
break;
}
llvm::APInt PointerNumAP(32, Info.NumberOfPtrs, /*isSigned=*/true);
QualType PointerArrayType =
Ctx.getConstantArrayType(Ctx.VoidPtrTy, PointerNumAP, ArrayType::Normal,
/*IndexTypeQuals=*/0);
Info.BasePointersArray =
CGF.CreateMemTemp(PointerArrayType, ".offload_baseptrs").getPointer();
Info.PointersArray =
CGF.CreateMemTemp(PointerArrayType, ".offload_ptrs").getPointer();
// If we don't have any VLA types or other types that require runtime
// evaluation, we can use a constant array for the map sizes, otherwise we
// need to fill up the arrays as we do for the pointers.
if (hasRuntimeEvaluationCaptureSize) {
QualType SizeArrayType = Ctx.getConstantArrayType(
Ctx.getSizeType(), PointerNumAP, ArrayType::Normal,
/*IndexTypeQuals=*/0);
Info.SizesArray =
CGF.CreateMemTemp(SizeArrayType, ".offload_sizes").getPointer();
} else {
// We expect all the sizes to be constant, so we collect them to create
// a constant array.
SmallVector<llvm::Constant *, 16> ConstSizes;
for (auto S : Sizes)
ConstSizes.push_back(cast<llvm::Constant>(S));
auto *SizesArrayInit = llvm::ConstantArray::get(
llvm::ArrayType::get(CGM.SizeTy, ConstSizes.size()), ConstSizes);
auto *SizesArrayGbl = new llvm::GlobalVariable(
CGM.getModule(), SizesArrayInit->getType(),
/*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
SizesArrayInit, ".offload_sizes");
SizesArrayGbl->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
Info.SizesArray = SizesArrayGbl;
}
// The map types are always constant so we don't need to generate code to
// fill arrays. Instead, we create an array constant.
llvm::Constant *MapTypesArrayInit =
llvm::ConstantDataArray::get(CGF.Builder.getContext(), MapTypes);
auto *MapTypesArrayGbl = new llvm::GlobalVariable(
CGM.getModule(), MapTypesArrayInit->getType(),
/*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
MapTypesArrayInit, ".offload_maptypes");
MapTypesArrayGbl->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
Info.MapTypesArray = MapTypesArrayGbl;
for (unsigned i = 0; i < Info.NumberOfPtrs; ++i) {
llvm::Value *BPVal = *BasePointers[i];
if (BPVal->getType()->isPointerTy())
BPVal = CGF.Builder.CreateBitCast(BPVal, CGM.VoidPtrTy);
else {
assert(BPVal->getType()->isIntegerTy() &&
"If not a pointer, the value type must be an integer.");
BPVal = CGF.Builder.CreateIntToPtr(BPVal, CGM.VoidPtrTy);
}
llvm::Value *BP = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
Info.BasePointersArray, 0, i);
Address BPAddr(BP, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
CGF.Builder.CreateStore(BPVal, BPAddr);
if (Info.requiresDevicePointerInfo())
if (auto *DevVD = BasePointers[i].getDevicePtrDecl())
Info.CaptureDeviceAddrMap.insert(std::make_pair(DevVD, BPAddr));
llvm::Value *PVal = Pointers[i];
if (PVal->getType()->isPointerTy())
PVal = CGF.Builder.CreateBitCast(PVal, CGM.VoidPtrTy);
else {
assert(PVal->getType()->isIntegerTy() &&
"If not a pointer, the value type must be an integer.");
PVal = CGF.Builder.CreateIntToPtr(PVal, CGM.VoidPtrTy);
}
llvm::Value *P = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
Info.PointersArray, 0, i);
Address PAddr(P, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
CGF.Builder.CreateStore(PVal, PAddr);
if (hasRuntimeEvaluationCaptureSize) {
llvm::Value *S = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.SizeTy, Info.NumberOfPtrs),
Info.SizesArray,
/*Idx0=*/0,
/*Idx1=*/i);
Address SAddr(S, Ctx.getTypeAlignInChars(Ctx.getSizeType()));
CGF.Builder.CreateStore(
CGF.Builder.CreateIntCast(Sizes[i], CGM.SizeTy, /*isSigned=*/true),
SAddr);
}
}
}
}
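// Editor's sketch of what the arrays built above amount to on the host side
// (not part of this change; the declarations are illustrative only):
//
//   void *baseptrs[N];   // .offload_baseptrs - one slot per map entry
//   void *ptrs[N];       // .offload_ptrs     - begin address of each section
//   size_t sizes[N];     // .offload_sizes    - constant global when all sizes
//                        //                     are compile-time constants,
//                        //                     otherwise filled at run time
//   int32_t maptypes[N]; // .offload_maptypes - always a constant global
//
// Non-pointer (integer) base values are stored with an inttoptr cast, and the
// per-entry stores into baseptrs/ptrs/sizes correspond to the GEP + store
// sequence in the loop above.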
/// \brief Emit the arguments to be passed to the runtime library based on the
/// arrays of pointers, sizes and map types.
static void emitOffloadingArraysArgument(
CodeGenFunction &CGF, llvm::Value *&BasePointersArrayArg,
llvm::Value *&PointersArrayArg, llvm::Value *&SizesArrayArg,
llvm::Value *&MapTypesArrayArg, CGOpenMPRuntime::TargetDataInfo &Info) {
auto &CGM = CGF.CGM;
if (Info.NumberOfPtrs) {
BasePointersArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
Info.BasePointersArray,
/*Idx0=*/0, /*Idx1=*/0);
PointersArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
Info.PointersArray,
/*Idx0=*/0,
/*Idx1=*/0);
SizesArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.SizeTy, Info.NumberOfPtrs), Info.SizesArray,
/*Idx0=*/0, /*Idx1=*/0);
MapTypesArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
llvm::ArrayType::get(CGM.Int32Ty, Info.NumberOfPtrs),
Info.MapTypesArray,
/*Idx0=*/0,
/*Idx1=*/0);
} else {
BasePointersArrayArg = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
PointersArrayArg = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
SizesArrayArg = llvm::ConstantPointerNull::get(CGM.SizeTy->getPointerTo());
MapTypesArrayArg =
llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo());
}
}
void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
const OMPExecutableDirective &D,
llvm::Value *OutlinedFn,
llvm::Value *OutlinedFnID,
const Expr *IfCond, const Expr *Device,
ArrayRef<llvm::Value *> CapturedVars) {
if (!CGF.HaveInsertPoint())
return;
assert(OutlinedFn && "Invalid outlined function!");
auto &Ctx = CGF.getContext();
// Fill up the arrays with all the captured variables.
MappableExprsHandler::MapValuesArrayTy KernelArgs;
MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
MappableExprsHandler::MapValuesArrayTy Pointers;
MappableExprsHandler::MapValuesArrayTy Sizes;
MappableExprsHandler::MapFlagsArrayTy MapTypes;
MappableExprsHandler::MapBaseValuesArrayTy CurBasePointers;
MappableExprsHandler::MapValuesArrayTy CurPointers;
MappableExprsHandler::MapValuesArrayTy CurSizes;
MappableExprsHandler::MapFlagsArrayTy CurMapTypes;
// Get mappable expression information.
MappableExprsHandler MEHandler(D, CGF);
const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
auto RI = CS.getCapturedRecordDecl()->field_begin();
auto CV = CapturedVars.begin();
for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(),
CE = CS.capture_end();
CI != CE; ++CI, ++RI, ++CV) {
StringRef Name;
QualType Ty;
CurBasePointers.clear();
CurPointers.clear();
CurSizes.clear();
CurMapTypes.clear();
// VLA sizes are passed to the outlined region by copy and do not have map
// information associated.
if (CI->capturesVariableArrayType()) {
CurBasePointers.push_back(*CV);
CurPointers.push_back(*CV);
CurSizes.push_back(CGF.getTypeSize(RI->getType()));
// Copy to the device as an argument. No need to retrieve it.
CurMapTypes.push_back(MappableExprsHandler::OMP_MAP_PRIVATE_VAL |
MappableExprsHandler::OMP_MAP_FIRST_REF);
} else {
// If we have any information in the map clause, we use it, otherwise we
// just do a default mapping.
MEHandler.generateInfoForCapture(CI, *CV, CurBasePointers, CurPointers,
CurSizes, CurMapTypes);
if (CurBasePointers.empty())
MEHandler.generateDefaultMapInfo(*CI, **RI, *CV, CurBasePointers,
CurPointers, CurSizes, CurMapTypes);
}
// We expect to have at least an element of information for this capture.
assert(!CurBasePointers.empty() && "Non-existing map pointer for capture!");
assert(CurBasePointers.size() == CurPointers.size() &&
CurBasePointers.size() == CurSizes.size() &&
CurBasePointers.size() == CurMapTypes.size() &&
"Inconsistent map information sizes!");
// The kernel args are always the first elements of the base pointers
// associated with a capture.
KernelArgs.push_back(*CurBasePointers.front());
// We need to append the results of this capture to what we already have.
BasePointers.append(CurBasePointers.begin(), CurBasePointers.end());
Pointers.append(CurPointers.begin(), CurPointers.end());
Sizes.append(CurSizes.begin(), CurSizes.end());
MapTypes.append(CurMapTypes.begin(), CurMapTypes.end());
}
// Keep track of whether the host function has to be executed.
auto OffloadErrorQType =
Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true);
auto OffloadError = CGF.MakeAddrLValue(
CGF.CreateMemTemp(OffloadErrorQType, ".run_host_version"),
OffloadErrorQType);
CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty),
OffloadError);
// Fill up the pointer arrays and transfer execution to the device.
auto &&ThenGen = [&Ctx, &BasePointers, &Pointers, &Sizes, &MapTypes, Device,
OutlinedFnID, OffloadError, OffloadErrorQType,
&D](CodeGenFunction &CGF, PrePostActionTy &) {
auto &RT = CGF.CGM.getOpenMPRuntime();
// Emit the offloading arrays.
TargetDataInfo Info;
emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
Info.PointersArray, Info.SizesArray,
Info.MapTypesArray, Info);
// On top of the arrays that were filled up, the target offloading call
// takes as arguments the device id as well as the host pointer. The host
// pointer is used by the runtime library to identify the current target
// region, so it only has to be unique and not necessarily point to
// anything. It could be the pointer to the outlined function that
// implements the target region, but we aren't using it, so the compiler
// doesn't need to keep it around and can therefore inline the host
// function if that proves worthwhile during optimization.
// From this point on, we need to have an ID of the target region defined.
assert(OutlinedFnID && "Invalid outlined function ID!");
// Emit device ID if any.
llvm::Value *DeviceID;
if (Device)
DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
CGF.Int32Ty, /*isSigned=*/true);
else
DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
// Emit the number of elements in the offloading arrays.
llvm::Value *PointerNum = CGF.Builder.getInt32(BasePointers.size());
// Return value of the runtime offloading call.
llvm::Value *Return;
auto *NumTeams = emitNumTeamsClauseForTargetDirective(RT, CGF, D);
auto *ThreadLimit = emitThreadLimitClauseForTargetDirective(RT, CGF, D);
// If we have NumTeams defined, this means that we have an enclosed teams
// region. Therefore we also expect ThreadLimit to be defined. These two
// values should be defined in the presence of a teams directive, regardless
// of whether any clauses are associated with it. If the user uses teams with
// no clauses, these two values will be the defaults that should be passed to
// the runtime library - a 32-bit integer with the value zero.
if (NumTeams) {
assert(ThreadLimit && "Thread limit expression should be available along "
"with number of teams.");
llvm::Value *OffloadingArgs[] = {
DeviceID, OutlinedFnID,
PointerNum, Info.BasePointersArray,
Info.PointersArray, Info.SizesArray,
Info.MapTypesArray, NumTeams,
ThreadLimit};
Return = CGF.EmitRuntimeCall(
RT.createRuntimeFunction(OMPRTL__tgt_target_teams), OffloadingArgs);
} else {
llvm::Value *OffloadingArgs[] = {
DeviceID, OutlinedFnID,
PointerNum, Info.BasePointersArray,
Info.PointersArray, Info.SizesArray,
Info.MapTypesArray};
Return = CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target),
OffloadingArgs);
}
CGF.EmitStoreOfScalar(Return, OffloadError);
};
// Notify that the host version must be executed.
auto &&ElseGen = [OffloadError](CodeGenFunction &CGF, PrePostActionTy &) {
CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/-1u),
OffloadError);
};
// If we have a target function ID it means that we need to support
// offloading; otherwise, just execute on the host. We need to execute on the
// host regardless of the conditional in the if clause if, e.g., the user does
// not specify target triples.
if (OutlinedFnID) {
if (IfCond)
emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
else {
RegionCodeGenTy ThenRCG(ThenGen);
ThenRCG(CGF);
}
} else {
RegionCodeGenTy ElseRCG(ElseGen);
ElseRCG(CGF);
}
// Check the error code and execute the host version if required.
auto OffloadFailedBlock = CGF.createBasicBlock("omp_offload.failed");
auto OffloadContBlock = CGF.createBasicBlock("omp_offload.cont");
auto OffloadErrorVal = CGF.EmitLoadOfScalar(OffloadError, SourceLocation());
auto Failed = CGF.Builder.CreateIsNotNull(OffloadErrorVal);
CGF.Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
CGF.EmitBlock(OffloadFailedBlock);
CGF.Builder.CreateCall(OutlinedFn, KernelArgs);
CGF.EmitBranch(OffloadContBlock);
CGF.EmitBlock(OffloadContBlock, /*IsFinished=*/true);
}
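// Editor's sketch - the control flow emitted above amounts to the following
// host-side pseudo code (not part of this change; argument lists abbreviated):
//
//   int32_t err = 0;                          // .run_host_version
//   if (have_outlined_fn_id && if_clause_ok)
//     err = __tgt_target[_teams](device_id, region_id, num_args,
//                                baseptrs, ptrs, sizes, maptypes, ...);
//   else
//     err = -1;                               // force the host version
//   if (err != 0)                             // omp_offload.failed
//     outlined_host_fn(kernel_args);          // omp_offload.cont follows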
void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S,
StringRef ParentName) {
if (!S)
return;
// If we find an OMP target directive, codegen the outlined function and
// register the result.
// FIXME: Add other directives with target when they become supported.
bool isTargetDirective = isa<OMPTargetDirective>(S);
if (isTargetDirective) {
auto *E = cast<OMPExecutableDirective>(S);
unsigned DeviceID;
unsigned FileID;
unsigned Line;
getTargetEntryUniqueInfo(CGM.getContext(), E->getLocStart(), DeviceID,
FileID, Line);
// Is this a target region that should not be emitted as an entry point? If
// so, just signal that we are done with this target region.
if (!OffloadEntriesInfoManager.hasTargetRegionEntryInfo(DeviceID, FileID,
ParentName, Line))
return;
llvm::Function *Fn;
llvm::Constant *Addr;
std::tie(Fn, Addr) =
CodeGenFunction::EmitOMPTargetDirectiveOutlinedFunction(
CGM, cast<OMPTargetDirective>(*E), ParentName,
/*isOffloadEntry=*/true);
assert(Fn && Addr && "Target region emission failed.");
return;
}
if (const OMPExecutableDirective *E = dyn_cast<OMPExecutableDirective>(S)) {
if (!E->hasAssociatedStmt())
return;
scanForTargetRegionsFunctions(
cast<CapturedStmt>(E->getAssociatedStmt())->getCapturedStmt(),
ParentName);
return;
}
// If this is a lambda function, look into its body.
if (auto *L = dyn_cast<LambdaExpr>(S))
S = L->getBody();
// Keep looking for target regions recursively.
for (auto *II : S->children())
scanForTargetRegionsFunctions(II, ParentName);
}
bool CGOpenMPRuntime::emitTargetFunctions(GlobalDecl GD) {
auto &FD = *cast<FunctionDecl>(GD.getDecl());
// If emitting code for the host, we do not process FD here. Instead we do
// the normal code generation.
if (!CGM.getLangOpts().OpenMPIsDevice)
return false;
// Try to detect target regions in the function.
scanForTargetRegionsFunctions(FD.getBody(), CGM.getMangledName(GD));
// We should not emit any function other than the ones created during the
// scanning. Therefore, we signal that this function is completely dealt
// with.
return true;
}
bool CGOpenMPRuntime::emitTargetGlobalVariable(GlobalDecl GD) {
if (!CGM.getLangOpts().OpenMPIsDevice)
return false;
// Check if there are Ctors/Dtors in this declaration and look for target
// regions in it. We use the complete variant to produce the kernel name
// mangling.
QualType RDTy = cast<VarDecl>(GD.getDecl())->getType();
if (auto *RD = RDTy->getBaseElementTypeUnsafe()->getAsCXXRecordDecl()) {
for (auto *Ctor : RD->ctors()) {
StringRef ParentName =
CGM.getMangledName(GlobalDecl(Ctor, Ctor_Complete));
scanForTargetRegionsFunctions(Ctor->getBody(), ParentName);
}
auto *Dtor = RD->getDestructor();
if (Dtor) {
StringRef ParentName =
CGM.getMangledName(GlobalDecl(Dtor, Dtor_Complete));
scanForTargetRegionsFunctions(Dtor->getBody(), ParentName);
}
}
// If we are in target mode we do not emit any global (declare target is not
// implemented yet). Therefore we signal that GD was processed in this case.
return true;
}
bool CGOpenMPRuntime::emitTargetGlobal(GlobalDecl GD) {
auto *VD = GD.getDecl();
if (isa<FunctionDecl>(VD))
return emitTargetFunctions(GD);
return emitTargetGlobalVariable(GD);
}
llvm::Function *CGOpenMPRuntime::emitRegistrationFunction() {
// If we have offloading in the current module, we need to emit the entries
// now and register the offloading descriptor.
createOffloadEntriesAndInfoMetadata();
// Create and register the offloading binary descriptors. This is the main
// entity that captures all the information about offloading in the current
// compilation unit.
return createOffloadingBinaryDescriptorRegistration();
}
void CGOpenMPRuntime::emitTeamsCall(CodeGenFunction &CGF,
const OMPExecutableDirective &D,
SourceLocation Loc,
llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars) {
if (!CGF.HaveInsertPoint())
return;
auto *RTLoc = emitUpdateLocation(CGF, Loc);
CodeGenFunction::RunCleanupsScope Scope(CGF);
// Build call __kmpc_fork_teams(loc, n, microtask, var1, .., varn);
llvm::Value *Args[] = {
RTLoc,
CGF.Builder.getInt32(CapturedVars.size()), // Number of captured vars
CGF.Builder.CreateBitCast(OutlinedFn, getKmpc_MicroPointerTy())};
llvm::SmallVector<llvm::Value *, 16> RealArgs;
RealArgs.append(std::begin(Args), std::end(Args));
RealArgs.append(CapturedVars.begin(), CapturedVars.end());
auto RTLFn = createRuntimeFunction(OMPRTL__kmpc_fork_teams);
CGF.EmitRuntimeCall(RTLFn, RealArgs);
}
void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF,
const Expr *NumTeams,
const Expr *ThreadLimit,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
auto *RTLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *NumTeamsVal =
(NumTeams)
? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(NumTeams),
CGF.CGM.Int32Ty, /* isSigned = */ true)
: CGF.Builder.getInt32(0);
llvm::Value *ThreadLimitVal =
(ThreadLimit)
? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(ThreadLimit),
CGF.CGM.Int32Ty, /* isSigned = */ true)
: CGF.Builder.getInt32(0);
// Build call __kmpc_push_num_teams(&loc, global_tid, num_teams, thread_limit)
llvm::Value *PushNumTeamsArgs[] = {RTLoc, getThreadID(CGF, Loc), NumTeamsVal,
ThreadLimitVal};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_teams),
PushNumTeamsArgs);
}
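// Editor's note - an example of the call emitted above (not part of this
// change). For a directive such as
//
//   #pragma omp target teams num_teams(4) thread_limit(64)
//
// the code above roughly emits, before the teams region is entered:
//
//   __kmpc_push_num_teams(&loc, gtid, /*num_teams=*/4, /*thread_limit=*/64);
//
// A missing clause is passed as zero.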
void CGOpenMPRuntime::emitTargetDataCalls(
CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *IfCond,
const Expr *Device, const RegionCodeGenTy &CodeGen, TargetDataInfo &Info) {
if (!CGF.HaveInsertPoint())
return;
// Action used to replace the default codegen action and turn privatization
// off.
PrePostActionTy NoPrivAction;
// Generate the code for the opening of the data environment. Capture all the
// arguments of the runtime call by reference because they are used in the
// closing of the region.
auto &&BeginThenGen = [&D, &CGF, Device, &Info, &CodeGen, &NoPrivAction](
CodeGenFunction &CGF, PrePostActionTy &) {
// Fill up the arrays with all the mapped variables.
MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
MappableExprsHandler::MapValuesArrayTy Pointers;
MappableExprsHandler::MapValuesArrayTy Sizes;
MappableExprsHandler::MapFlagsArrayTy MapTypes;
// Get map clause information.
MappableExprsHandler MCHandler(D, CGF);
MCHandler.generateAllInfo(BasePointers, Pointers, Sizes, MapTypes);
// Fill up the arrays and create the arguments.
emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
llvm::Value *BasePointersArrayArg = nullptr;
llvm::Value *PointersArrayArg = nullptr;
llvm::Value *SizesArrayArg = nullptr;
llvm::Value *MapTypesArrayArg = nullptr;
emitOffloadingArraysArgument(CGF, BasePointersArrayArg, PointersArrayArg,
SizesArrayArg, MapTypesArrayArg, Info);
// Emit device ID if any.
llvm::Value *DeviceID = nullptr;
if (Device)
DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
CGF.Int32Ty, /*isSigned=*/true);
else
DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
// Emit the number of elements in the offloading arrays.
auto *PointerNum = CGF.Builder.getInt32(Info.NumberOfPtrs);
llvm::Value *OffloadingArgs[] = {
DeviceID, PointerNum, BasePointersArrayArg,
PointersArrayArg, SizesArrayArg, MapTypesArrayArg};
auto &RT = CGF.CGM.getOpenMPRuntime();
CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target_data_begin),
OffloadingArgs);
// If device pointer privatization is required, emit the body of the region
// here. It will have to be duplicated: with and without privatization.
if (!Info.CaptureDeviceAddrMap.empty())
CodeGen(CGF);
};
// Generate code for the closing of the data region.
auto &&EndThenGen = [&CGF, Device, &Info](CodeGenFunction &CGF,
PrePostActionTy &) {
assert(Info.isValid() && "Invalid data environment closing arguments.");
llvm::Value *BasePointersArrayArg = nullptr;
llvm::Value *PointersArrayArg = nullptr;
llvm::Value *SizesArrayArg = nullptr;
llvm::Value *MapTypesArrayArg = nullptr;
emitOffloadingArraysArgument(CGF, BasePointersArrayArg, PointersArrayArg,
SizesArrayArg, MapTypesArrayArg, Info);
// Emit device ID if any.
llvm::Value *DeviceID = nullptr;
if (Device)
DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
CGF.Int32Ty, /*isSigned=*/true);
else
DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
// Emit the number of elements in the offloading arrays.
auto *PointerNum = CGF.Builder.getInt32(Info.NumberOfPtrs);
llvm::Value *OffloadingArgs[] = {
DeviceID, PointerNum, BasePointersArrayArg,
PointersArrayArg, SizesArrayArg, MapTypesArrayArg};
auto &RT = CGF.CGM.getOpenMPRuntime();
CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target_data_end),
OffloadingArgs);
};
// If we need device pointer privatization, we need to emit the body of the
// region with no privatization in the 'else' branch of the conditional.
// Otherwise, we don't have to do anything.
auto &&BeginElseGen = [&Info, &CodeGen, &NoPrivAction](CodeGenFunction &CGF,
PrePostActionTy &) {
if (!Info.CaptureDeviceAddrMap.empty()) {
CodeGen.setAction(NoPrivAction);
CodeGen(CGF);
}
};
// We don't have to do anything to close the region if the if clause evaluates
// to false.
auto &&EndElseGen = [](CodeGenFunction &CGF, PrePostActionTy &) {};
if (IfCond) {
emitOMPIfClause(CGF, IfCond, BeginThenGen, BeginElseGen);
} else {
RegionCodeGenTy RCG(BeginThenGen);
RCG(CGF);
}
// If we don't require privatization of device pointers, we emit the body in
// between the runtime calls. This avoids duplicating the body code.
if (Info.CaptureDeviceAddrMap.empty()) {
CodeGen.setAction(NoPrivAction);
CodeGen(CGF);
}
if (IfCond) {
emitOMPIfClause(CGF, IfCond, EndThenGen, EndElseGen);
} else {
RegionCodeGenTy RCG(EndThenGen);
RCG(CGF);
}
}
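// Editor's sketch - overall shape of the code emitted for a
// '#pragma omp target data' region (not part of this change):
//
//   if (if_clause)  __tgt_target_data_begin(dev, n, baseptrs, ptrs,
//                                           sizes, maptypes);
//   <body>          // emitted once here when no device pointer needs
//                   // privatization; otherwise it is emitted twice, inside
//                   // the then/else branches above, with and without the
//                   // use_device_ptr privatization
//   if (if_clause)  __tgt_target_data_end(dev, n, baseptrs, ptrs,
//                                         sizes, maptypes);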
void CGOpenMPRuntime::emitTargetDataStandAloneCall(
CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *IfCond,
const Expr *Device) {
if (!CGF.HaveInsertPoint())
return;
assert((isa<OMPTargetEnterDataDirective>(D) ||
isa<OMPTargetExitDataDirective>(D) ||
isa<OMPTargetUpdateDirective>(D)) &&
"Expecting either target enter, exit data, or update directives.");
// Generate the code for the opening of the data environment.
auto &&ThenGen = [&D, &CGF, Device](CodeGenFunction &CGF, PrePostActionTy &) {
// Fill up the arrays with all the mapped variables.
MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
MappableExprsHandler::MapValuesArrayTy Pointers;
MappableExprsHandler::MapValuesArrayTy Sizes;
MappableExprsHandler::MapFlagsArrayTy MapTypes;
// Get map clause information.
MappableExprsHandler MEHandler(D, CGF);
MEHandler.generateAllInfo(BasePointers, Pointers, Sizes, MapTypes);
// Fill up the arrays and create the arguments.
TargetDataInfo Info;
emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
Info.PointersArray, Info.SizesArray,
Info.MapTypesArray, Info);
// Emit device ID if any.
llvm::Value *DeviceID = nullptr;
if (Device)
DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
CGF.Int32Ty, /*isSigned=*/true);
else
DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);
// Emit the number of elements in the offloading arrays.
auto *PointerNum = CGF.Builder.getInt32(BasePointers.size());
llvm::Value *OffloadingArgs[] = {
DeviceID, PointerNum, Info.BasePointersArray,
Info.PointersArray, Info.SizesArray, Info.MapTypesArray};
auto &RT = CGF.CGM.getOpenMPRuntime();
// Select the right runtime function call for each expected standalone
// directive.
OpenMPRTLFunction RTLFn;
switch (D.getDirectiveKind()) {
default:
llvm_unreachable("Unexpected standalone target data directive.");
break;
case OMPD_target_enter_data:
RTLFn = OMPRTL__tgt_target_data_begin;
break;
case OMPD_target_exit_data:
RTLFn = OMPRTL__tgt_target_data_end;
break;
case OMPD_target_update:
RTLFn = OMPRTL__tgt_target_data_update;
break;
}
CGF.EmitRuntimeCall(RT.createRuntimeFunction(RTLFn), OffloadingArgs);
};
// In the event we get an if clause, we don't have to take any action on the
// else side.
auto &&ElseGen = [](CodeGenFunction &CGF, PrePostActionTy &) {};
if (IfCond) {
emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
} else {
RegionCodeGenTy ThenGenRCG(ThenGen);
ThenGenRCG(CGF);
}
}
namespace {
/// Kind of parameter in a function with 'declare simd' directive.
enum ParamKindTy { LinearWithVarStride, Linear, Uniform, Vector };
/// Attribute set of the parameter.
struct ParamAttrTy {
ParamKindTy Kind = Vector;
llvm::APSInt StrideOrArg;
llvm::APSInt Alignment;
};
} // namespace
static unsigned evaluateCDTSize(const FunctionDecl *FD,
ArrayRef<ParamAttrTy> ParamAttrs) {
// Every vector variant of a SIMD-enabled function has a vector length (VLEN).
// If OpenMP clause "simdlen" is used, the VLEN is the value of the argument
// of that clause. The VLEN value must be power of 2.
// Otherwise, the notion of the function's "characteristic data type" (CDT)
// is used to compute the vector length.
// CDT is defined in the following order:
// a) For non-void function, the CDT is the return type.
// b) If the function has any non-uniform, non-linear parameters, then the
// CDT is the type of the first such parameter.
// c) If the CDT determined by a) or b) above is a struct, union, or class
// type which is passed by value (except for the type that maps to the
// built-in complex data type), the characteristic data type is int.
// d) If none of the above three cases is applicable, the CDT is int.
// The VLEN is then determined based on the CDT and the size of the vector
// register of the ISA for which the current vector version is generated. The
// VLEN is computed using the formula below:
// VLEN = sizeof(vector_register) / sizeof(CDT),
// where the vector register size is specified in section 3.2.1 "Registers and
// the Stack Frame" of the original AMD64 ABI document.
QualType RetType = FD->getReturnType();
if (RetType.isNull())
return 0;
ASTContext &C = FD->getASTContext();
QualType CDT;
if (!RetType.isNull() && !RetType->isVoidType())
CDT = RetType;
else {
unsigned Offset = 0;
if (auto *MD = dyn_cast<CXXMethodDecl>(FD)) {
if (ParamAttrs[Offset].Kind == Vector)
CDT = C.getPointerType(C.getRecordType(MD->getParent()));
++Offset;
}
if (CDT.isNull()) {
for (unsigned I = 0, E = FD->getNumParams(); I < E; ++I) {
if (ParamAttrs[I + Offset].Kind == Vector) {
CDT = FD->getParamDecl(I)->getType();
break;
}
}
}
}
if (CDT.isNull())
CDT = C.IntTy;
CDT = CDT->getCanonicalTypeUnqualified();
if (CDT->isRecordType() || CDT->isUnionType())
CDT = C.IntTy;
return C.getTypeSize(CDT);
}
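// Editor's note - a worked example of the rules above (not part of this
// change). For
//
//   #pragma omp declare simd
//   double foo(double x);
//
// the CDT is the return type 'double' (rule a), so this function returns 64
// (bits). With the 256-bit AVX vector register size used below, that yields
// VLEN = 256 / 64 = 4.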
static void
emitX86DeclareSimdFunction(const FunctionDecl *FD, llvm::Function *Fn,
const llvm::APSInt &VLENVal,
ArrayRef<ParamAttrTy> ParamAttrs,
OMPDeclareSimdDeclAttr::BranchStateTy State) {
struct ISADataTy {
char ISA;
unsigned VecRegSize;
};
ISADataTy ISAData[] = {
{'b', 128}, // SSE
{'c', 256}, // AVX
{'d', 256}, // AVX2
{'e', 512}, // AVX512
};
llvm::SmallVector<char, 2> Masked;
switch (State) {
case OMPDeclareSimdDeclAttr::BS_Undefined:
Masked.push_back('N');
Masked.push_back('M');
break;
case OMPDeclareSimdDeclAttr::BS_Notinbranch:
Masked.push_back('N');
break;
case OMPDeclareSimdDeclAttr::BS_Inbranch:
Masked.push_back('M');
break;
}
for (auto Mask : Masked) {
for (auto &Data : ISAData) {
SmallString<256> Buffer;
llvm::raw_svector_ostream Out(Buffer);
Out << "_ZGV" << Data.ISA << Mask;
if (!VLENVal) {
Out << llvm::APSInt::getUnsigned(Data.VecRegSize /
evaluateCDTSize(FD, ParamAttrs));
} else
Out << VLENVal;
for (auto &ParamAttr : ParamAttrs) {
switch (ParamAttr.Kind) {
case LinearWithVarStride:
Out << 's' << ParamAttr.StrideOrArg;
break;
case Linear:
Out << 'l';
if (!!ParamAttr.StrideOrArg)
Out << ParamAttr.StrideOrArg;
break;
case Uniform:
Out << 'u';
break;
case Vector:
Out << 'v';
break;
}
if (!!ParamAttr.Alignment)
Out << 'a' << ParamAttr.Alignment;
}
Out << '_' << Fn->getName();
Fn->addFnAttr(Out.str());
}
}
}
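// Editor's note - continuing the example above (not part of this change), a
// 'declare simd' with no branch-state clause on
//
//   double foo(double x);
//
// adds both unmasked and masked variants for each ISA as function attributes:
//
//   _ZGVbN2v_foo  _ZGVcN4v_foo  _ZGVdN4v_foo  _ZGVeN8v_foo   // notinbranch
//   _ZGVbM2v_foo  _ZGVcM4v_foo  _ZGVdM4v_foo  _ZGVeM8v_foo   // inbranch
//
// where 'v' encodes the vector parameter; a uniform or linear parameter would
// be encoded as 'u' or 'l' (with an optional stride) instead, and an aligned
// parameter appends 'a' followed by the alignment.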
void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
llvm::Function *Fn) {
ASTContext &C = CGM.getContext();
FD = FD->getCanonicalDecl();
// Map params to their positions in function decl.
llvm::DenseMap<const Decl *, unsigned> ParamPositions;
if (isa<CXXMethodDecl>(FD))
ParamPositions.insert({FD, 0});
unsigned ParamPos = ParamPositions.size();
for (auto *P : FD->parameters()) {
ParamPositions.insert({P->getCanonicalDecl(), ParamPos});
++ParamPos;
}
for (auto *Attr : FD->specific_attrs<OMPDeclareSimdDeclAttr>()) {
llvm::SmallVector<ParamAttrTy, 8> ParamAttrs(ParamPositions.size());
// Mark uniform parameters.
for (auto *E : Attr->uniforms()) {
E = E->IgnoreParenImpCasts();
unsigned Pos;
if (isa<CXXThisExpr>(E))
Pos = ParamPositions[FD];
else {
auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
->getCanonicalDecl();
Pos = ParamPositions[PVD];
}
ParamAttrs[Pos].Kind = Uniform;
}
// Get alignment info.
auto NI = Attr->alignments_begin();
for (auto *E : Attr->aligneds()) {
E = E->IgnoreParenImpCasts();
unsigned Pos;
QualType ParmTy;
if (isa<CXXThisExpr>(E)) {
Pos = ParamPositions[FD];
ParmTy = E->getType();
} else {
auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
->getCanonicalDecl();
Pos = ParamPositions[PVD];
ParmTy = PVD->getType();
}
ParamAttrs[Pos].Alignment =
(*NI) ? (*NI)->EvaluateKnownConstInt(C)
: llvm::APSInt::getUnsigned(
C.toCharUnitsFromBits(C.getOpenMPDefaultSimdAlign(ParmTy))
.getQuantity());
++NI;
}
// Mark linear parameters.
auto SI = Attr->steps_begin();
auto MI = Attr->modifiers_begin();
for (auto *E : Attr->linears()) {
E = E->IgnoreParenImpCasts();
unsigned Pos;
if (isa<CXXThisExpr>(E))
Pos = ParamPositions[FD];
else {
auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
->getCanonicalDecl();
Pos = ParamPositions[PVD];
}
auto &ParamAttr = ParamAttrs[Pos];
ParamAttr.Kind = Linear;
if (*SI) {
if (!(*SI)->EvaluateAsInt(ParamAttr.StrideOrArg, C,
Expr::SE_AllowSideEffects)) {
if (auto *DRE = cast<DeclRefExpr>((*SI)->IgnoreParenImpCasts())) {
if (auto *StridePVD = cast<ParmVarDecl>(DRE->getDecl())) {
ParamAttr.Kind = LinearWithVarStride;
ParamAttr.StrideOrArg = llvm::APSInt::getUnsigned(
ParamPositions[StridePVD->getCanonicalDecl()]);
}
}
}
}
++SI;
++MI;
}
llvm::APSInt VLENVal;
if (const Expr *VLEN = Attr->getSimdlen())
VLENVal = VLEN->EvaluateKnownConstInt(C);
OMPDeclareSimdDeclAttr::BranchStateTy State = Attr->getBranchState();
if (CGM.getTriple().getArch() == llvm::Triple::x86 ||
CGM.getTriple().getArch() == llvm::Triple::x86_64)
emitX86DeclareSimdFunction(FD, Fn, VLENVal, ParamAttrs, State);
}
}
namespace {
/// Cleanup action for doacross support.
class DoacrossCleanupTy final : public EHScopeStack::Cleanup {
public:
static const int DoacrossFinArgs = 2;
private:
llvm::Value *RTLFn;
llvm::Value *Args[DoacrossFinArgs];
public:
DoacrossCleanupTy(llvm::Value *RTLFn, ArrayRef<llvm::Value *> CallArgs)
: RTLFn(RTLFn) {
assert(CallArgs.size() == DoacrossFinArgs);
std::copy(CallArgs.begin(), CallArgs.end(), std::begin(Args));
}
void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
if (!CGF.HaveInsertPoint())
return;
CGF.EmitRuntimeCall(RTLFn, Args);
}
};
} // namespace
void CGOpenMPRuntime::emitDoacrossInit(CodeGenFunction &CGF,
const OMPLoopDirective &D) {
if (!CGF.HaveInsertPoint())
return;
ASTContext &C = CGM.getContext();
QualType Int64Ty = C.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/true);
RecordDecl *RD;
if (KmpDimTy.isNull()) {
// Build struct kmp_dim { // loop bounds info casted to kmp_int64
// kmp_int64 lo; // lower
// kmp_int64 up; // upper
// kmp_int64 st; // stride
// };
RD = C.buildImplicitRecord("kmp_dim");
RD->startDefinition();
addFieldToRecordDecl(C, RD, Int64Ty);
addFieldToRecordDecl(C, RD, Int64Ty);
addFieldToRecordDecl(C, RD, Int64Ty);
RD->completeDefinition();
KmpDimTy = C.getRecordType(RD);
} else
RD = cast<RecordDecl>(KmpDimTy->getAsTagDecl());
Address DimsAddr = CGF.CreateMemTemp(KmpDimTy, "dims");
CGF.EmitNullInitialization(DimsAddr, KmpDimTy);
enum { LowerFD = 0, UpperFD, StrideFD };
// Fill dims with data.
LValue DimsLVal = CGF.MakeAddrLValue(DimsAddr, KmpDimTy);
// dims.upper = num_iterations;
LValue UpperLVal =
CGF.EmitLValueForField(DimsLVal, *std::next(RD->field_begin(), UpperFD));
llvm::Value *NumIterVal = CGF.EmitScalarConversion(
CGF.EmitScalarExpr(D.getNumIterations()), D.getNumIterations()->getType(),
Int64Ty, D.getNumIterations()->getExprLoc());
CGF.EmitStoreOfScalar(NumIterVal, UpperLVal);
// dims.stride = 1;
LValue StrideLVal =
CGF.EmitLValueForField(DimsLVal, *std::next(RD->field_begin(), StrideFD));
CGF.EmitStoreOfScalar(llvm::ConstantInt::getSigned(CGM.Int64Ty, /*V=*/1),
StrideLVal);
// Build call void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid,
// kmp_int32 num_dims, struct kmp_dim * dims);
llvm::Value *Args[] = {emitUpdateLocation(CGF, D.getLocStart()),
getThreadID(CGF, D.getLocStart()),
llvm::ConstantInt::getSigned(CGM.Int32Ty, 1),
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
DimsAddr.getPointer(), CGM.VoidPtrTy)};
llvm::Value *RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_init);
CGF.EmitRuntimeCall(RTLFn, Args);
llvm::Value *FiniArgs[DoacrossCleanupTy::DoacrossFinArgs] = {
emitUpdateLocation(CGF, D.getLocEnd()), getThreadID(CGF, D.getLocEnd())};
llvm::Value *FiniRTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_fini);
CGF.EHStack.pushCleanup<DoacrossCleanupTy>(NormalAndEHCleanup, FiniRTLFn,
llvm::makeArrayRef(FiniArgs));
}
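// Editor's sketch - the runtime call built above corresponds to filling one
// kmp_dim record and passing it along (not part of this change):
//
//   struct kmp_dim { int64_t lo, up, st; };
//   kmp_dim dims = {0, num_iterations, 1};   // lo stays zero-initialized
//   __kmpc_doacross_init(&loc, gtid, /*num_dims=*/1, &dims);
//
// and a matching __kmpc_doacross_fini(&loc, gtid) is registered as a cleanup
// for the end of the loop region.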
void CGOpenMPRuntime::emitDoacrossOrdered(CodeGenFunction &CGF,
const OMPDependClause *C) {
QualType Int64Ty =
CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1);
const Expr *CounterVal = C->getCounterValue();
assert(CounterVal);
llvm::Value *CntVal = CGF.EmitScalarConversion(CGF.EmitScalarExpr(CounterVal),
CounterVal->getType(), Int64Ty,
CounterVal->getExprLoc());
Address CntAddr = CGF.CreateMemTemp(Int64Ty, ".cnt.addr");
CGF.EmitStoreOfScalar(CntVal, CntAddr, /*Volatile=*/false, Int64Ty);
llvm::Value *Args[] = {emitUpdateLocation(CGF, C->getLocStart()),
getThreadID(CGF, C->getLocStart()),
CntAddr.getPointer()};
llvm::Value *RTLFn;
if (C->getDependencyKind() == OMPC_DEPEND_source)
RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_post);
else {
assert(C->getDependencyKind() == OMPC_DEPEND_sink);
RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_wait);
}
CGF.EmitRuntimeCall(RTLFn, Args);
}
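// Editor's note - an example of the mapping above (not part of this change):
// inside a loop with an ordered(1) clause,
//
//   #pragma omp ordered depend(source)        // -> __kmpc_doacross_post
//   #pragma omp ordered depend(sink: i - 1)   // -> __kmpc_doacross_wait
//
// and both calls receive the dependence counter as a kmp_int64 value
// materialized in the '.cnt.addr' temporary above.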
Index: projects/clang400-import/contrib/llvm/tools/clang/lib/Frontend/InitPreprocessor.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/tools/clang/lib/Frontend/InitPreprocessor.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/clang/lib/Frontend/InitPreprocessor.cpp (revision 314269)
@@ -1,1100 +1,1102 @@
//===--- InitPreprocessor.cpp - PP initialization code. ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the clang::InitializePreprocessor function.
//
//===----------------------------------------------------------------------===//
#include "clang/Basic/FileManager.h"
#include "clang/Basic/MacroBuilder.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/Version.h"
#include "clang/Frontend/FrontendDiagnostic.h"
#include "clang/Frontend/FrontendOptions.h"
#include "clang/Frontend/Utils.h"
#include "clang/Lex/HeaderSearch.h"
#include "clang/Lex/PTHManager.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
#include "clang/Serialization/ASTReader.h"
#include "llvm/ADT/APFloat.h"
using namespace clang;
static bool MacroBodyEndsInBackslash(StringRef MacroBody) {
while (!MacroBody.empty() && isWhitespace(MacroBody.back()))
MacroBody = MacroBody.drop_back();
return !MacroBody.empty() && MacroBody.back() == '\\';
}
// Append a #define line to Buf for Macro. Macro should be of the form XXX,
// in which case we emit "#define XXX 1", or "XXX=Y z W", in which case we emit
// "#define XXX Y z W". To get a #define with no value, use "XXX=".
static void DefineBuiltinMacro(MacroBuilder &Builder, StringRef Macro,
DiagnosticsEngine &Diags) {
std::pair<StringRef, StringRef> MacroPair = Macro.split('=');
StringRef MacroName = MacroPair.first;
StringRef MacroBody = MacroPair.second;
if (MacroName.size() != Macro.size()) {
// Per GCC -D semantics, the macro ends at \n if it exists.
StringRef::size_type End = MacroBody.find_first_of("\n\r");
if (End != StringRef::npos)
Diags.Report(diag::warn_fe_macro_contains_embedded_newline)
<< MacroName;
MacroBody = MacroBody.substr(0, End);
// We handle macro bodies which end in a backslash by appending an extra
// backslash+newline. This makes sure we don't accidentally treat the
// backslash as a line continuation marker.
if (MacroBodyEndsInBackslash(MacroBody))
Builder.defineMacro(MacroName, Twine(MacroBody) + "\\\n");
else
Builder.defineMacro(MacroName, MacroBody);
} else {
// Push "macroname 1".
Builder.defineMacro(Macro);
}
}
/// AddImplicitInclude - Add an implicit \#include of the specified file to the
/// predefines buffer.
/// As these includes are generated by -include arguments, the header search
/// logic is going to search relative to the current working directory.
static void AddImplicitInclude(MacroBuilder &Builder, StringRef File) {
Builder.append(Twine("#include \"") + File + "\"");
}
static void AddImplicitIncludeMacros(MacroBuilder &Builder, StringRef File) {
Builder.append(Twine("#__include_macros \"") + File + "\"");
// Marker token to stop the __include_macros fetch loop.
Builder.append("##"); // ##?
}
/// AddImplicitIncludePTH - Add an implicit \#include using the original file
/// used to generate a PTH cache.
static void AddImplicitIncludePTH(MacroBuilder &Builder, Preprocessor &PP,
StringRef ImplicitIncludePTH) {
PTHManager *P = PP.getPTHManager();
// Null check 'P' in the corner case where it couldn't be created.
const char *OriginalFile = P ? P->getOriginalSourceFile() : nullptr;
if (!OriginalFile) {
PP.getDiagnostics().Report(diag::err_fe_pth_file_has_no_source_header)
<< ImplicitIncludePTH;
return;
}
AddImplicitInclude(Builder, OriginalFile);
}
/// \brief Add an implicit \#include using the original file used to generate
/// a PCH file.
static void AddImplicitIncludePCH(MacroBuilder &Builder, Preprocessor &PP,
const PCHContainerReader &PCHContainerRdr,
StringRef ImplicitIncludePCH) {
std::string OriginalFile =
ASTReader::getOriginalSourceFile(ImplicitIncludePCH, PP.getFileManager(),
PCHContainerRdr, PP.getDiagnostics());
if (OriginalFile.empty())
return;
AddImplicitInclude(Builder, OriginalFile);
}
/// PickFP - This is used to pick a value based on the FP semantics of the
/// specified FP model.
template <typename T>
static T PickFP(const llvm::fltSemantics *Sem, T IEEESingleVal,
T IEEEDoubleVal, T X87DoubleExtendedVal, T PPCDoubleDoubleVal,
T IEEEQuadVal) {
if (Sem == (const llvm::fltSemantics*)&llvm::APFloat::IEEEsingle())
return IEEESingleVal;
if (Sem == (const llvm::fltSemantics*)&llvm::APFloat::IEEEdouble())
return IEEEDoubleVal;
if (Sem == (const llvm::fltSemantics*)&llvm::APFloat::x87DoubleExtended())
return X87DoubleExtendedVal;
if (Sem == (const llvm::fltSemantics*)&llvm::APFloat::PPCDoubleDouble())
return PPCDoubleDoubleVal;
assert(Sem == (const llvm::fltSemantics*)&llvm::APFloat::IEEEquad());
return IEEEQuadVal;
}
static void DefineFloatMacros(MacroBuilder &Builder, StringRef Prefix,
const llvm::fltSemantics *Sem, StringRef Ext) {
const char *DenormMin, *Epsilon, *Max, *Min;
DenormMin = PickFP(Sem, "1.40129846e-45", "4.9406564584124654e-324",
"3.64519953188247460253e-4951",
"4.94065645841246544176568792868221e-324",
"6.47517511943802511092443895822764655e-4966");
int Digits = PickFP(Sem, 6, 15, 18, 31, 33);
int DecimalDigits = PickFP(Sem, 9, 17, 21, 33, 36);
Epsilon = PickFP(Sem, "1.19209290e-7", "2.2204460492503131e-16",
"1.08420217248550443401e-19",
"4.94065645841246544176568792868221e-324",
"1.92592994438723585305597794258492732e-34");
int MantissaDigits = PickFP(Sem, 24, 53, 64, 106, 113);
int Min10Exp = PickFP(Sem, -37, -307, -4931, -291, -4931);
int Max10Exp = PickFP(Sem, 38, 308, 4932, 308, 4932);
int MinExp = PickFP(Sem, -125, -1021, -16381, -968, -16381);
int MaxExp = PickFP(Sem, 128, 1024, 16384, 1024, 16384);
Min = PickFP(Sem, "1.17549435e-38", "2.2250738585072014e-308",
"3.36210314311209350626e-4932",
"2.00416836000897277799610805135016e-292",
"3.36210314311209350626267781732175260e-4932");
Max = PickFP(Sem, "3.40282347e+38", "1.7976931348623157e+308",
"1.18973149535723176502e+4932",
"1.79769313486231580793728971405301e+308",
"1.18973149535723176508575932662800702e+4932");
SmallString<32> DefPrefix;
DefPrefix = "__";
DefPrefix += Prefix;
DefPrefix += "_";
Builder.defineMacro(DefPrefix + "DENORM_MIN__", Twine(DenormMin)+Ext);
Builder.defineMacro(DefPrefix + "HAS_DENORM__");
Builder.defineMacro(DefPrefix + "DIG__", Twine(Digits));
Builder.defineMacro(DefPrefix + "DECIMAL_DIG__", Twine(DecimalDigits));
Builder.defineMacro(DefPrefix + "EPSILON__", Twine(Epsilon)+Ext);
Builder.defineMacro(DefPrefix + "HAS_INFINITY__");
Builder.defineMacro(DefPrefix + "HAS_QUIET_NAN__");
Builder.defineMacro(DefPrefix + "MANT_DIG__", Twine(MantissaDigits));
Builder.defineMacro(DefPrefix + "MAX_10_EXP__", Twine(Max10Exp));
Builder.defineMacro(DefPrefix + "MAX_EXP__", Twine(MaxExp));
Builder.defineMacro(DefPrefix + "MAX__", Twine(Max)+Ext);
Builder.defineMacro(DefPrefix + "MIN_10_EXP__","("+Twine(Min10Exp)+")");
Builder.defineMacro(DefPrefix + "MIN_EXP__", "("+Twine(MinExp)+")");
Builder.defineMacro(DefPrefix + "MIN__", Twine(Min)+Ext);
}
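// Editor's note - example output (not part of this change): assuming the
// caller passes Prefix="FLT", IEEE single-precision semantics, and Ext="F",
// the predefines gain, among others:
//
//   #define __FLT_DIG__ 6
//   #define __FLT_MANT_DIG__ 24
//   #define __FLT_EPSILON__ 1.19209290e-7F
//   #define __FLT_MIN__ 1.17549435e-38F
//   #define __FLT_MAX__ 3.40282347e+38F
//   #define __FLT_MIN_EXP__ (-125)
//   #define __FLT_MAX_EXP__ 128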
/// DefineTypeSize - Emit a macro to the predefines buffer that declares a macro
/// named MacroName with the max value for a type with width 'TypeWidth', a
/// signedness of 'isSigned', and a value suffix of 'ValSuffix' (e.g. LL).
static void DefineTypeSize(const Twine &MacroName, unsigned TypeWidth,
StringRef ValSuffix, bool isSigned,
MacroBuilder &Builder) {
llvm::APInt MaxVal = isSigned ? llvm::APInt::getSignedMaxValue(TypeWidth)
: llvm::APInt::getMaxValue(TypeWidth);
Builder.defineMacro(MacroName, MaxVal.toString(10, isSigned) + ValSuffix);
}
/// DefineTypeSize - An overloaded helper that uses TargetInfo to determine
/// the width, suffix, and signedness of the given type
static void DefineTypeSize(const Twine &MacroName, TargetInfo::IntType Ty,
const TargetInfo &TI, MacroBuilder &Builder) {
DefineTypeSize(MacroName, TI.getTypeWidth(Ty), TI.getTypeConstantSuffix(Ty),
TI.isTypeSigned(Ty), Builder);
}
static void DefineFmt(const Twine &Prefix, TargetInfo::IntType Ty,
const TargetInfo &TI, MacroBuilder &Builder) {
bool IsSigned = TI.isTypeSigned(Ty);
StringRef FmtModifier = TI.getTypeFormatModifier(Ty);
for (const char *Fmt = IsSigned ? "di" : "ouxX"; *Fmt; ++Fmt) {
Builder.defineMacro(Prefix + "_FMT" + Twine(*Fmt) + "__",
Twine("\"") + FmtModifier + Twine(*Fmt) + "\"");
}
}
static void DefineType(const Twine &MacroName, TargetInfo::IntType Ty,
MacroBuilder &Builder) {
Builder.defineMacro(MacroName, TargetInfo::getTypeName(Ty));
}
static void DefineTypeWidth(StringRef MacroName, TargetInfo::IntType Ty,
const TargetInfo &TI, MacroBuilder &Builder) {
Builder.defineMacro(MacroName, Twine(TI.getTypeWidth(Ty)));
}
static void DefineTypeSizeof(StringRef MacroName, unsigned BitWidth,
const TargetInfo &TI, MacroBuilder &Builder) {
Builder.defineMacro(MacroName,
Twine(BitWidth / TI.getCharWidth()));
}
static void DefineExactWidthIntType(TargetInfo::IntType Ty,
const TargetInfo &TI,
MacroBuilder &Builder) {
int TypeWidth = TI.getTypeWidth(Ty);
bool IsSigned = TI.isTypeSigned(Ty);
// Use the target specified int64 type, when appropriate, so that [u]int64_t
// ends up being defined in terms of the correct type.
if (TypeWidth == 64)
Ty = IsSigned ? TI.getInt64Type() : TI.getUInt64Type();
const char *Prefix = IsSigned ? "__INT" : "__UINT";
DefineType(Prefix + Twine(TypeWidth) + "_TYPE__", Ty, Builder);
DefineFmt(Prefix + Twine(TypeWidth), Ty, TI, Builder);
StringRef ConstSuffix(TI.getTypeConstantSuffix(Ty));
Builder.defineMacro(Prefix + Twine(TypeWidth) + "_C_SUFFIX__", ConstSuffix);
}
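// Illustrative sketch (not part of the original source): for a 32-bit signed
// int on a typical target, DefineExactWidthIntType(TargetInfo::SignedInt, ...)
// produces roughly the following; the format modifier and constant suffix come
// from the TargetInfo hooks, so the exact strings are target-dependent:
//
//   #define __INT32_TYPE__ int
//   #define __INT32_FMTd__ "d"
//   #define __INT32_FMTi__ "i"
//   #define __INT32_C_SUFFIX__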
static void DefineExactWidthIntTypeSize(TargetInfo::IntType Ty,
const TargetInfo &TI,
MacroBuilder &Builder) {
int TypeWidth = TI.getTypeWidth(Ty);
bool IsSigned = TI.isTypeSigned(Ty);
// Use the target specified int64 type, when appropriate, so that [u]int64_t
// ends up being defined in terms of the correct type.
if (TypeWidth == 64)
Ty = IsSigned ? TI.getInt64Type() : TI.getUInt64Type();
const char *Prefix = IsSigned ? "__INT" : "__UINT";
DefineTypeSize(Prefix + Twine(TypeWidth) + "_MAX__", Ty, TI, Builder);
}
static void DefineLeastWidthIntType(unsigned TypeWidth, bool IsSigned,
const TargetInfo &TI,
MacroBuilder &Builder) {
TargetInfo::IntType Ty = TI.getLeastIntTypeByWidth(TypeWidth, IsSigned);
if (Ty == TargetInfo::NoInt)
return;
const char *Prefix = IsSigned ? "__INT_LEAST" : "__UINT_LEAST";
DefineType(Prefix + Twine(TypeWidth) + "_TYPE__", Ty, Builder);
DefineTypeSize(Prefix + Twine(TypeWidth) + "_MAX__", Ty, TI, Builder);
DefineFmt(Prefix + Twine(TypeWidth), Ty, TI, Builder);
}
static void DefineFastIntType(unsigned TypeWidth, bool IsSigned,
const TargetInfo &TI, MacroBuilder &Builder) {
// stdint.h currently defines the fast int types as equivalent to the least
// types.
TargetInfo::IntType Ty = TI.getLeastIntTypeByWidth(TypeWidth, IsSigned);
if (Ty == TargetInfo::NoInt)
return;
const char *Prefix = IsSigned ? "__INT_FAST" : "__UINT_FAST";
DefineType(Prefix + Twine(TypeWidth) + "_TYPE__", Ty, Builder);
DefineTypeSize(Prefix + Twine(TypeWidth) + "_MAX__", Ty, TI, Builder);
DefineFmt(Prefix + Twine(TypeWidth), Ty, TI, Builder);
}
/// Get the value the ATOMIC_*_LOCK_FREE macro should have for a type with
/// the specified properties.
-static const char *getLockFreeValue(unsigned TypeWidth, unsigned InlineWidth) {
+static const char *getLockFreeValue(unsigned TypeWidth, unsigned TypeAlign,
+ unsigned InlineWidth) {
// Fully-aligned, power-of-2 sizes no larger than the inline
// width will be inlined as lock-free operations.
- // Note: we do not need to check alignment since _Atomic(T) is always
- // appropriately-aligned in clang.
- if ((TypeWidth & (TypeWidth - 1)) == 0 && TypeWidth <= InlineWidth)
+ if (TypeWidth == TypeAlign && (TypeWidth & (TypeWidth - 1)) == 0 &&
+ TypeWidth <= InlineWidth)
return "2"; // "always lock free"
// We cannot be certain what operations the lib calls might be
// able to implement as lock-free on future processors.
return "1"; // "sometimes lock free"
}
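// Illustrative note (not part of the original source): the returned string
// becomes the value of the __GCC_ATOMIC_*_LOCK_FREE predefines consumed by
// <atomic> / <stdatomic.h>, where 2 means "always lock free", 1 "sometimes
// lock free", and 0 "never lock free" (never produced here). The new
// TypeAlign check means an under-aligned type is no longer reported as
// always lock free, e.g.:
//
//   getLockFreeValue(/*TypeWidth=*/64, /*TypeAlign=*/32, /*InlineWidth=*/64)
//   // returns "1": the width is a power of two and fits InlineWidth, but
//   // the type is not naturally aligned, so lock freedom is not guaranteed.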
/// \brief Add definitions required for a smooth interaction between
/// Objective-C++ automated reference counting and libstdc++ (4.2).
static void AddObjCXXARCLibstdcxxDefines(const LangOptions &LangOpts,
MacroBuilder &Builder) {
Builder.defineMacro("_GLIBCXX_PREDEFINED_OBJC_ARC_IS_SCALAR");
std::string Result;
{
// Provide specializations for the __is_scalar type trait so that
// lifetime-qualified objects are not considered "scalar" types, which
// libstdc++ uses as an indicator of the presence of trivial copy, assign,
// default-construct, and destruct semantics (none of which hold for
// lifetime-qualified objects in ARC).
llvm::raw_string_ostream Out(Result);
Out << "namespace std {\n"
<< "\n"
<< "struct __true_type;\n"
<< "struct __false_type;\n"
<< "\n";
Out << "template<typename _Tp> struct __is_scalar;\n"
<< "\n";
if (LangOpts.ObjCAutoRefCount) {
Out << "template<typename _Tp>\n"
<< "struct __is_scalar<__attribute__((objc_ownership(strong))) _Tp> {\n"
<< " enum { __value = 0 };\n"
<< " typedef __false_type __type;\n"
<< "};\n"
<< "\n";
}
if (LangOpts.ObjCWeak) {
Out << "template<typename _Tp>\n"
<< "struct __is_scalar<__attribute__((objc_ownership(weak))) _Tp> {\n"
<< " enum { __value = 0 };\n"
<< " typedef __false_type __type;\n"
<< "};\n"
<< "\n";
}
if (LangOpts.ObjCAutoRefCount) {
Out << "template<typename _Tp>\n"
<< "struct __is_scalar<__attribute__((objc_ownership(autoreleasing)))"
<< " _Tp> {\n"
<< " enum { __value = 0 };\n"
<< " typedef __false_type __type;\n"
<< "};\n"
<< "\n";
}
Out << "}\n";
}
Builder.append(Result);
}
static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
const LangOptions &LangOpts,
const FrontendOptions &FEOpts,
MacroBuilder &Builder) {
if (!LangOpts.MSVCCompat && !LangOpts.TraditionalCPP)
Builder.defineMacro("__STDC__");
if (LangOpts.Freestanding)
Builder.defineMacro("__STDC_HOSTED__", "0");
else
Builder.defineMacro("__STDC_HOSTED__");
if (!LangOpts.CPlusPlus) {
if (LangOpts.C11)
Builder.defineMacro("__STDC_VERSION__", "201112L");
else if (LangOpts.C99)
Builder.defineMacro("__STDC_VERSION__", "199901L");
else if (!LangOpts.GNUMode && LangOpts.Digraphs)
Builder.defineMacro("__STDC_VERSION__", "199409L");
} else {
// FIXME: Use correct value for C++17.
if (LangOpts.CPlusPlus1z)
Builder.defineMacro("__cplusplus", "201406L");
// C++1y [cpp.predefined]p1:
// The name __cplusplus is defined to the value 201402L when compiling a
// C++ translation unit.
else if (LangOpts.CPlusPlus14)
Builder.defineMacro("__cplusplus", "201402L");
// C++11 [cpp.predefined]p1:
// The name __cplusplus is defined to the value 201103L when compiling a
// C++ translation unit.
else if (LangOpts.CPlusPlus11)
Builder.defineMacro("__cplusplus", "201103L");
// C++03 [cpp.predefined]p1:
// The name __cplusplus is defined to the value 199711L when compiling a
// C++ translation unit.
else
Builder.defineMacro("__cplusplus", "199711L");
// C++1z [cpp.predefined]p1:
// An integer literal of type std::size_t whose value is the alignment
// guaranteed by a call to operator new(std::size_t)
//
// We provide this in all language modes, since it seems generally useful.
Builder.defineMacro("__STDCPP_DEFAULT_NEW_ALIGNMENT__",
Twine(TI.getNewAlign() / TI.getCharWidth()) +
TI.getTypeConstantSuffix(TI.getSizeType()));
}
// In C11 these are environment macros. In C++11 they are only defined
// as part of <cuchar>. To prevent breakage when mixing C and C++ code,
// define these macros unconditionally; this is safe because Clang always
// uses UTF-16 and UTF-32 for 16-bit and 32-bit character literals.
Builder.defineMacro("__STDC_UTF_16__", "1");
Builder.defineMacro("__STDC_UTF_32__", "1");
if (LangOpts.ObjC1)
Builder.defineMacro("__OBJC__");
// OpenCL v1.0/1.1 s6.9, v1.2/2.0 s6.10: Preprocessor Directives and Macros.
if (LangOpts.OpenCL) {
// OpenCL v1.0 and v1.1 do not have a predefined macro to indicate the
// language standard with which the program is compiled. __OPENCL_VERSION__
// is for the OpenCL version supported by the OpenCL device, which is not
// necessarily the language standard with which the program is compiled.
// A shared OpenCL header file requires a macro to indicate the language
// standard. As a workaround, __OPENCL_C_VERSION__ is defined for
// OpenCL v1.0 and v1.1.
switch (LangOpts.OpenCLVersion) {
case 100:
Builder.defineMacro("__OPENCL_C_VERSION__", "100");
break;
case 110:
Builder.defineMacro("__OPENCL_C_VERSION__", "110");
break;
case 120:
Builder.defineMacro("__OPENCL_C_VERSION__", "120");
break;
case 200:
Builder.defineMacro("__OPENCL_C_VERSION__", "200");
break;
default:
llvm_unreachable("Unsupported OpenCL version");
}
Builder.defineMacro("CL_VERSION_1_0", "100");
Builder.defineMacro("CL_VERSION_1_1", "110");
Builder.defineMacro("CL_VERSION_1_2", "120");
Builder.defineMacro("CL_VERSION_2_0", "200");
if (TI.isLittleEndian())
Builder.defineMacro("__ENDIAN_LITTLE__");
if (LangOpts.FastRelaxedMath)
Builder.defineMacro("__FAST_RELAXED_MATH__");
}
// Not "standard" per se, but available even with the -undef flag.
if (LangOpts.AsmPreprocessor)
Builder.defineMacro("__ASSEMBLER__");
if (LangOpts.CUDA)
Builder.defineMacro("__CUDA__");
}
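// Illustrative sketch (not part of the original source): user code typically
// keys off the standard macros defined above, for example:
//
//   #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
//   _Static_assert(1, "C11 or later");
//   #endif
//   #if !__STDC_HOSTED__
//   /* freestanding environment: only a subset of the library is available */
//   #endif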
/// Initialize the predefined C++ language feature test macros defined in
/// ISO/IEC JTC1/SC22/WG21 (C++) SD-6: "SG10 Feature Test Recommendations".
static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts,
MacroBuilder &Builder) {
// C++98 features.
if (LangOpts.RTTI)
Builder.defineMacro("__cpp_rtti", "199711");
if (LangOpts.CXXExceptions)
Builder.defineMacro("__cpp_exceptions", "199711");
// C++11 features.
if (LangOpts.CPlusPlus11) {
Builder.defineMacro("__cpp_unicode_characters", "200704");
Builder.defineMacro("__cpp_raw_strings", "200710");
Builder.defineMacro("__cpp_unicode_literals", "200710");
Builder.defineMacro("__cpp_user_defined_literals", "200809");
Builder.defineMacro("__cpp_lambdas", "200907");
Builder.defineMacro("__cpp_constexpr",
LangOpts.CPlusPlus14 ? "201304" : "200704");
Builder.defineMacro("__cpp_range_based_for",
LangOpts.CPlusPlus1z ? "201603" : "200907");
Builder.defineMacro("__cpp_static_assert",
LangOpts.CPlusPlus1z ? "201411" : "200410");
Builder.defineMacro("__cpp_decltype", "200707");
Builder.defineMacro("__cpp_attributes", "200809");
Builder.defineMacro("__cpp_rvalue_references", "200610");
Builder.defineMacro("__cpp_variadic_templates", "200704");
Builder.defineMacro("__cpp_initializer_lists", "200806");
Builder.defineMacro("__cpp_delegating_constructors", "200604");
Builder.defineMacro("__cpp_nsdmi", "200809");
Builder.defineMacro("__cpp_inheriting_constructors", "201511");
Builder.defineMacro("__cpp_ref_qualifiers", "200710");
Builder.defineMacro("__cpp_alias_templates", "200704");
}
// C++14 features.
if (LangOpts.CPlusPlus14) {
Builder.defineMacro("__cpp_binary_literals", "201304");
Builder.defineMacro("__cpp_digit_separators", "201309");
Builder.defineMacro("__cpp_init_captures", "201304");
Builder.defineMacro("__cpp_generic_lambdas", "201304");
Builder.defineMacro("__cpp_decltype_auto", "201304");
Builder.defineMacro("__cpp_return_type_deduction", "201304");
Builder.defineMacro("__cpp_aggregate_nsdmi", "201304");
Builder.defineMacro("__cpp_variable_templates", "201304");
}
if (LangOpts.SizedDeallocation)
Builder.defineMacro("__cpp_sized_deallocation", "201309");
// C++17 features.
if (LangOpts.CPlusPlus1z) {
Builder.defineMacro("__cpp_hex_float", "201603");
Builder.defineMacro("__cpp_inline_variables", "201606");
Builder.defineMacro("__cpp_noexcept_function_type", "201510");
Builder.defineMacro("__cpp_capture_star_this", "201603");
Builder.defineMacro("__cpp_if_constexpr", "201606");
Builder.defineMacro("__cpp_template_auto", "201606");
Builder.defineMacro("__cpp_namespace_attributes", "201411");
Builder.defineMacro("__cpp_enumerator_attributes", "201411");
Builder.defineMacro("__cpp_nested_namespace_definitions", "201411");
Builder.defineMacro("__cpp_variadic_using", "201611");
Builder.defineMacro("__cpp_aggregate_bases", "201603");
Builder.defineMacro("__cpp_structured_bindings", "201606");
Builder.defineMacro("__cpp_nontype_template_args", "201411");
Builder.defineMacro("__cpp_fold_expressions", "201603");
}
if (LangOpts.AlignedAllocation)
Builder.defineMacro("__cpp_aligned_new", "201606");
// TS features.
if (LangOpts.ConceptsTS)
Builder.defineMacro("__cpp_experimental_concepts", "1");
if (LangOpts.CoroutinesTS)
Builder.defineMacro("__cpp_coroutines", "1");
}
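// Illustrative sketch (not part of the original source): these SD-6 feature
// test macros let headers probe for language features instead of compiler
// versions, e.g.:
//
//   #if defined(__cpp_if_constexpr) && __cpp_if_constexpr >= 201606
//   // 'if constexpr' may be used here
//   #endif
//   #if defined(__cpp_variable_templates) && __cpp_variable_templates >= 201304
//   template <class T> constexpr bool is_small_v = sizeof(T) <= sizeof(void *);
//   #endif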
static void InitializePredefinedMacros(const TargetInfo &TI,
const LangOptions &LangOpts,
const FrontendOptions &FEOpts,
MacroBuilder &Builder) {
// Compiler version introspection macros.
Builder.defineMacro("__llvm__"); // LLVM Backend
Builder.defineMacro("__clang__"); // Clang Frontend
#define TOSTR2(X) #X
#define TOSTR(X) TOSTR2(X)
Builder.defineMacro("__clang_major__", TOSTR(CLANG_VERSION_MAJOR));
Builder.defineMacro("__clang_minor__", TOSTR(CLANG_VERSION_MINOR));
Builder.defineMacro("__clang_patchlevel__", TOSTR(CLANG_VERSION_PATCHLEVEL));
#undef TOSTR
#undef TOSTR2
Builder.defineMacro("__clang_version__",
"\"" CLANG_VERSION_STRING " "
+ getClangFullRepositoryVersion() + "\"");
if (!LangOpts.MSVCCompat) {
// Currently claim to be compatible with GCC 4.2.1-5621, but only if we're
// not compiling for MSVC compatibility
Builder.defineMacro("__GNUC_MINOR__", "2");
Builder.defineMacro("__GNUC_PATCHLEVEL__", "1");
Builder.defineMacro("__GNUC__", "4");
Builder.defineMacro("__GXX_ABI_VERSION", "1002");
}
// Define macros for the C11 / C++11 memory orderings
Builder.defineMacro("__ATOMIC_RELAXED", "0");
Builder.defineMacro("__ATOMIC_CONSUME", "1");
Builder.defineMacro("__ATOMIC_ACQUIRE", "2");
Builder.defineMacro("__ATOMIC_RELEASE", "3");
Builder.defineMacro("__ATOMIC_ACQ_REL", "4");
Builder.defineMacro("__ATOMIC_SEQ_CST", "5");
// Support for #pragma redefine_extname (Sun compatibility)
Builder.defineMacro("__PRAGMA_REDEFINE_EXTNAME", "1");
// As sad as it is, enough software depends on the __VERSION__ for version
// checks that it is necessary to report 4.2.1 (the base GCC version we claim
// compatibility with) first.
Builder.defineMacro("__VERSION__", "\"4.2.1 Compatible " +
Twine(getClangFullCPPVersion()) + "\"");
// Initialize language-specific preprocessor defines.
// Standard conforming mode?
if (!LangOpts.GNUMode && !LangOpts.MSVCCompat)
Builder.defineMacro("__STRICT_ANSI__");
if (!LangOpts.MSVCCompat && LangOpts.CPlusPlus11)
Builder.defineMacro("__GXX_EXPERIMENTAL_CXX0X__");
if (LangOpts.ObjC1) {
if (LangOpts.ObjCRuntime.isNonFragile()) {
Builder.defineMacro("__OBJC2__");
if (LangOpts.ObjCExceptions)
Builder.defineMacro("OBJC_ZEROCOST_EXCEPTIONS");
}
Builder.defineMacro("__OBJC_BOOL_IS_BOOL",
Twine(TI.useSignedCharForObjCBool() ? "0" : "1"));
if (LangOpts.getGC() != LangOptions::NonGC)
Builder.defineMacro("__OBJC_GC__");
if (LangOpts.ObjCRuntime.isNeXTFamily())
Builder.defineMacro("__NEXT_RUNTIME__");
if (LangOpts.ObjCRuntime.getKind() == ObjCRuntime::ObjFW) {
VersionTuple tuple = LangOpts.ObjCRuntime.getVersion();
unsigned minor = 0;
if (tuple.getMinor().hasValue())
minor = tuple.getMinor().getValue();
unsigned subminor = 0;
if (tuple.getSubminor().hasValue())
subminor = tuple.getSubminor().getValue();
Builder.defineMacro("__OBJFW_RUNTIME_ABI__",
Twine(tuple.getMajor() * 10000 + minor * 100 +
subminor));
}
Builder.defineMacro("IBOutlet", "__attribute__((iboutlet))");
Builder.defineMacro("IBOutletCollection(ClassName)",
"__attribute__((iboutletcollection(ClassName)))");
Builder.defineMacro("IBAction", "void)__attribute__((ibaction)");
Builder.defineMacro("IBInspectable", "");
Builder.defineMacro("IB_DESIGNABLE", "");
}
if (LangOpts.CPlusPlus)
InitializeCPlusPlusFeatureTestMacros(LangOpts, Builder);
// darwin_constant_cfstrings controls this. It also depends on other
// things, such as the runtime, I believe. This is set even for C code.
if (!LangOpts.NoConstantCFStrings)
Builder.defineMacro("__CONSTANT_CFSTRINGS__");
if (LangOpts.ObjC2)
Builder.defineMacro("OBJC_NEW_PROPERTIES");
if (LangOpts.PascalStrings)
Builder.defineMacro("__PASCAL_STRINGS__");
if (LangOpts.Blocks) {
Builder.defineMacro("__block", "__attribute__((__blocks__(byref)))");
Builder.defineMacro("__BLOCKS__");
}
if (!LangOpts.MSVCCompat && LangOpts.Exceptions)
Builder.defineMacro("__EXCEPTIONS");
if (!LangOpts.MSVCCompat && LangOpts.RTTI)
Builder.defineMacro("__GXX_RTTI");
if (LangOpts.SjLjExceptions)
Builder.defineMacro("__USING_SJLJ_EXCEPTIONS__");
if (LangOpts.Deprecated)
Builder.defineMacro("__DEPRECATED");
if (!LangOpts.MSVCCompat && LangOpts.CPlusPlus) {
Builder.defineMacro("__GNUG__", "4");
Builder.defineMacro("__GXX_WEAK__");
Builder.defineMacro("__private_extern__", "extern");
}
if (LangOpts.MicrosoftExt) {
if (LangOpts.WChar) {
// wchar_t supported as a keyword.
Builder.defineMacro("_WCHAR_T_DEFINED");
Builder.defineMacro("_NATIVE_WCHAR_T_DEFINED");
}
}
if (LangOpts.Optimize)
Builder.defineMacro("__OPTIMIZE__");
if (LangOpts.OptimizeSize)
Builder.defineMacro("__OPTIMIZE_SIZE__");
if (LangOpts.FastMath)
Builder.defineMacro("__FAST_MATH__");
// Initialize target-specific preprocessor defines.
// __BYTE_ORDER__ was added in GCC 4.6. It's analogous
// to the macro __BYTE_ORDER (no trailing underscores)
// from glibc's <endian.h> header.
// We don't support the PDP-11 as a target, but include
// the define so it can still be compared against.
Builder.defineMacro("__ORDER_LITTLE_ENDIAN__", "1234");
Builder.defineMacro("__ORDER_BIG_ENDIAN__", "4321");
Builder.defineMacro("__ORDER_PDP_ENDIAN__", "3412");
if (TI.isBigEndian()) {
Builder.defineMacro("__BYTE_ORDER__", "__ORDER_BIG_ENDIAN__");
Builder.defineMacro("__BIG_ENDIAN__");
} else {
Builder.defineMacro("__BYTE_ORDER__", "__ORDER_LITTLE_ENDIAN__");
Builder.defineMacro("__LITTLE_ENDIAN__");
}
if (TI.getPointerWidth(0) == 64 && TI.getLongWidth() == 64
&& TI.getIntWidth() == 32) {
Builder.defineMacro("_LP64");
Builder.defineMacro("__LP64__");
}
if (TI.getPointerWidth(0) == 32 && TI.getLongWidth() == 32
&& TI.getIntWidth() == 32) {
Builder.defineMacro("_ILP32");
Builder.defineMacro("__ILP32__");
}
// Define type sizing macros based on the target properties.
assert(TI.getCharWidth() == 8 && "Only support 8-bit char so far");
Builder.defineMacro("__CHAR_BIT__", Twine(TI.getCharWidth()));
DefineTypeSize("__SCHAR_MAX__", TargetInfo::SignedChar, TI, Builder);
DefineTypeSize("__SHRT_MAX__", TargetInfo::SignedShort, TI, Builder);
DefineTypeSize("__INT_MAX__", TargetInfo::SignedInt, TI, Builder);
DefineTypeSize("__LONG_MAX__", TargetInfo::SignedLong, TI, Builder);
DefineTypeSize("__LONG_LONG_MAX__", TargetInfo::SignedLongLong, TI, Builder);
DefineTypeSize("__WCHAR_MAX__", TI.getWCharType(), TI, Builder);
DefineTypeSize("__INTMAX_MAX__", TI.getIntMaxType(), TI, Builder);
DefineTypeSize("__SIZE_MAX__", TI.getSizeType(), TI, Builder);
DefineTypeSize("__UINTMAX_MAX__", TI.getUIntMaxType(), TI, Builder);
DefineTypeSize("__PTRDIFF_MAX__", TI.getPtrDiffType(0), TI, Builder);
DefineTypeSize("__INTPTR_MAX__", TI.getIntPtrType(), TI, Builder);
DefineTypeSize("__UINTPTR_MAX__", TI.getUIntPtrType(), TI, Builder);
DefineTypeSizeof("__SIZEOF_DOUBLE__", TI.getDoubleWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_FLOAT__", TI.getFloatWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_INT__", TI.getIntWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_LONG__", TI.getLongWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_LONG_DOUBLE__",TI.getLongDoubleWidth(),TI,Builder);
DefineTypeSizeof("__SIZEOF_LONG_LONG__", TI.getLongLongWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_POINTER__", TI.getPointerWidth(0), TI, Builder);
DefineTypeSizeof("__SIZEOF_SHORT__", TI.getShortWidth(), TI, Builder);
DefineTypeSizeof("__SIZEOF_PTRDIFF_T__",
TI.getTypeWidth(TI.getPtrDiffType(0)), TI, Builder);
DefineTypeSizeof("__SIZEOF_SIZE_T__",
TI.getTypeWidth(TI.getSizeType()), TI, Builder);
DefineTypeSizeof("__SIZEOF_WCHAR_T__",
TI.getTypeWidth(TI.getWCharType()), TI, Builder);
DefineTypeSizeof("__SIZEOF_WINT_T__",
TI.getTypeWidth(TI.getWIntType()), TI, Builder);
if (TI.hasInt128Type())
DefineTypeSizeof("__SIZEOF_INT128__", 128, TI, Builder);
DefineType("__INTMAX_TYPE__", TI.getIntMaxType(), Builder);
DefineFmt("__INTMAX", TI.getIntMaxType(), TI, Builder);
Builder.defineMacro("__INTMAX_C_SUFFIX__",
TI.getTypeConstantSuffix(TI.getIntMaxType()));
DefineType("__UINTMAX_TYPE__", TI.getUIntMaxType(), Builder);
DefineFmt("__UINTMAX", TI.getUIntMaxType(), TI, Builder);
Builder.defineMacro("__UINTMAX_C_SUFFIX__",
TI.getTypeConstantSuffix(TI.getUIntMaxType()));
DefineTypeWidth("__INTMAX_WIDTH__", TI.getIntMaxType(), TI, Builder);
DefineType("__PTRDIFF_TYPE__", TI.getPtrDiffType(0), Builder);
DefineFmt("__PTRDIFF", TI.getPtrDiffType(0), TI, Builder);
DefineTypeWidth("__PTRDIFF_WIDTH__", TI.getPtrDiffType(0), TI, Builder);
DefineType("__INTPTR_TYPE__", TI.getIntPtrType(), Builder);
DefineFmt("__INTPTR", TI.getIntPtrType(), TI, Builder);
DefineTypeWidth("__INTPTR_WIDTH__", TI.getIntPtrType(), TI, Builder);
DefineType("__SIZE_TYPE__", TI.getSizeType(), Builder);
DefineFmt("__SIZE", TI.getSizeType(), TI, Builder);
DefineTypeWidth("__SIZE_WIDTH__", TI.getSizeType(), TI, Builder);
DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder);
DefineTypeWidth("__WCHAR_WIDTH__", TI.getWCharType(), TI, Builder);
DefineType("__WINT_TYPE__", TI.getWIntType(), Builder);
DefineTypeWidth("__WINT_WIDTH__", TI.getWIntType(), TI, Builder);
DefineTypeWidth("__SIG_ATOMIC_WIDTH__", TI.getSigAtomicType(), TI, Builder);
DefineTypeSize("__SIG_ATOMIC_MAX__", TI.getSigAtomicType(), TI, Builder);
DefineType("__CHAR16_TYPE__", TI.getChar16Type(), Builder);
DefineType("__CHAR32_TYPE__", TI.getChar32Type(), Builder);
DefineTypeWidth("__UINTMAX_WIDTH__", TI.getUIntMaxType(), TI, Builder);
DefineType("__UINTPTR_TYPE__", TI.getUIntPtrType(), Builder);
DefineFmt("__UINTPTR", TI.getUIntPtrType(), TI, Builder);
DefineTypeWidth("__UINTPTR_WIDTH__", TI.getUIntPtrType(), TI, Builder);
DefineFloatMacros(Builder, "FLT", &TI.getFloatFormat(), "F");
DefineFloatMacros(Builder, "DBL", &TI.getDoubleFormat(), "");
DefineFloatMacros(Builder, "LDBL", &TI.getLongDoubleFormat(), "L");
// Define a __POINTER_WIDTH__ macro for stdint.h.
Builder.defineMacro("__POINTER_WIDTH__",
Twine((int)TI.getPointerWidth(0)));
// Define __BIGGEST_ALIGNMENT__ to be compatible with gcc.
Builder.defineMacro("__BIGGEST_ALIGNMENT__",
Twine(TI.getSuitableAlign() / TI.getCharWidth()) );
if (!LangOpts.CharIsSigned)
Builder.defineMacro("__CHAR_UNSIGNED__");
if (!TargetInfo::isTypeSigned(TI.getWCharType()))
Builder.defineMacro("__WCHAR_UNSIGNED__");
if (!TargetInfo::isTypeSigned(TI.getWIntType()))
Builder.defineMacro("__WINT_UNSIGNED__");
// Define exact-width integer types for stdint.h
DefineExactWidthIntType(TargetInfo::SignedChar, TI, Builder);
if (TI.getShortWidth() > TI.getCharWidth())
DefineExactWidthIntType(TargetInfo::SignedShort, TI, Builder);
if (TI.getIntWidth() > TI.getShortWidth())
DefineExactWidthIntType(TargetInfo::SignedInt, TI, Builder);
if (TI.getLongWidth() > TI.getIntWidth())
DefineExactWidthIntType(TargetInfo::SignedLong, TI, Builder);
if (TI.getLongLongWidth() > TI.getLongWidth())
DefineExactWidthIntType(TargetInfo::SignedLongLong, TI, Builder);
DefineExactWidthIntType(TargetInfo::UnsignedChar, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::UnsignedChar, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::SignedChar, TI, Builder);
if (TI.getShortWidth() > TI.getCharWidth()) {
DefineExactWidthIntType(TargetInfo::UnsignedShort, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::UnsignedShort, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::SignedShort, TI, Builder);
}
if (TI.getIntWidth() > TI.getShortWidth()) {
DefineExactWidthIntType(TargetInfo::UnsignedInt, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::UnsignedInt, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::SignedInt, TI, Builder);
}
if (TI.getLongWidth() > TI.getIntWidth()) {
DefineExactWidthIntType(TargetInfo::UnsignedLong, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::UnsignedLong, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::SignedLong, TI, Builder);
}
if (TI.getLongLongWidth() > TI.getLongWidth()) {
DefineExactWidthIntType(TargetInfo::UnsignedLongLong, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::UnsignedLongLong, TI, Builder);
DefineExactWidthIntTypeSize(TargetInfo::SignedLongLong, TI, Builder);
}
DefineLeastWidthIntType(8, true, TI, Builder);
DefineLeastWidthIntType(8, false, TI, Builder);
DefineLeastWidthIntType(16, true, TI, Builder);
DefineLeastWidthIntType(16, false, TI, Builder);
DefineLeastWidthIntType(32, true, TI, Builder);
DefineLeastWidthIntType(32, false, TI, Builder);
DefineLeastWidthIntType(64, true, TI, Builder);
DefineLeastWidthIntType(64, false, TI, Builder);
DefineFastIntType(8, true, TI, Builder);
DefineFastIntType(8, false, TI, Builder);
DefineFastIntType(16, true, TI, Builder);
DefineFastIntType(16, false, TI, Builder);
DefineFastIntType(32, true, TI, Builder);
DefineFastIntType(32, false, TI, Builder);
DefineFastIntType(64, true, TI, Builder);
DefineFastIntType(64, false, TI, Builder);
char UserLabelPrefix[2] = {TI.getDataLayout().getGlobalPrefix(), 0};
Builder.defineMacro("__USER_LABEL_PREFIX__", UserLabelPrefix);
if (LangOpts.FastMath || LangOpts.FiniteMathOnly)
Builder.defineMacro("__FINITE_MATH_ONLY__", "1");
else
Builder.defineMacro("__FINITE_MATH_ONLY__", "0");
if (!LangOpts.MSVCCompat) {
if (LangOpts.GNUInline || LangOpts.CPlusPlus)
Builder.defineMacro("__GNUC_GNU_INLINE__");
else
Builder.defineMacro("__GNUC_STDC_INLINE__");
// The value written by __atomic_test_and_set.
// FIXME: This is target-dependent.
Builder.defineMacro("__GCC_ATOMIC_TEST_AND_SET_TRUEVAL", "1");
// Used by libc++ and libstdc++ to implement ATOMIC_<foo>_LOCK_FREE.
unsigned InlineWidthBits = TI.getMaxAtomicInlineWidth();
#define DEFINE_LOCK_FREE_MACRO(TYPE, Type) \
Builder.defineMacro("__GCC_ATOMIC_" #TYPE "_LOCK_FREE", \
getLockFreeValue(TI.get##Type##Width(), \
+ TI.get##Type##Align(), \
InlineWidthBits));
DEFINE_LOCK_FREE_MACRO(BOOL, Bool);
DEFINE_LOCK_FREE_MACRO(CHAR, Char);
DEFINE_LOCK_FREE_MACRO(CHAR16_T, Char16);
DEFINE_LOCK_FREE_MACRO(CHAR32_T, Char32);
DEFINE_LOCK_FREE_MACRO(WCHAR_T, WChar);
DEFINE_LOCK_FREE_MACRO(SHORT, Short);
DEFINE_LOCK_FREE_MACRO(INT, Int);
DEFINE_LOCK_FREE_MACRO(LONG, Long);
DEFINE_LOCK_FREE_MACRO(LLONG, LongLong);
Builder.defineMacro("__GCC_ATOMIC_POINTER_LOCK_FREE",
getLockFreeValue(TI.getPointerWidth(0),
+ TI.getPointerAlign(0),
InlineWidthBits));
#undef DEFINE_LOCK_FREE_MACRO
}
if (LangOpts.NoInlineDefine)
Builder.defineMacro("__NO_INLINE__");
if (unsigned PICLevel = LangOpts.PICLevel) {
Builder.defineMacro("__PIC__", Twine(PICLevel));
Builder.defineMacro("__pic__", Twine(PICLevel));
if (LangOpts.PIE) {
Builder.defineMacro("__PIE__", Twine(PICLevel));
Builder.defineMacro("__pie__", Twine(PICLevel));
}
}
// Macros to control C99 numerics and <float.h>
Builder.defineMacro("__FLT_EVAL_METHOD__", Twine(TI.getFloatEvalMethod()));
Builder.defineMacro("__FLT_RADIX__", "2");
Builder.defineMacro("__DECIMAL_DIG__", "__LDBL_DECIMAL_DIG__");
if (LangOpts.getStackProtector() == LangOptions::SSPOn)
Builder.defineMacro("__SSP__");
else if (LangOpts.getStackProtector() == LangOptions::SSPStrong)
Builder.defineMacro("__SSP_STRONG__", "2");
else if (LangOpts.getStackProtector() == LangOptions::SSPReq)
Builder.defineMacro("__SSP_ALL__", "3");
// Define a macro that exists only when using the static analyzer.
if (FEOpts.ProgramAction == frontend::RunAnalysis)
Builder.defineMacro("__clang_analyzer__");
if (LangOpts.FastRelaxedMath)
Builder.defineMacro("__FAST_RELAXED_MATH__");
if (FEOpts.ProgramAction == frontend::RewriteObjC ||
LangOpts.getGC() != LangOptions::NonGC) {
Builder.defineMacro("__weak", "__attribute__((objc_gc(weak)))");
Builder.defineMacro("__strong", "__attribute__((objc_gc(strong)))");
Builder.defineMacro("__autoreleasing", "");
Builder.defineMacro("__unsafe_unretained", "");
} else if (LangOpts.ObjC1) {
Builder.defineMacro("__weak", "__attribute__((objc_ownership(weak)))");
Builder.defineMacro("__strong", "__attribute__((objc_ownership(strong)))");
Builder.defineMacro("__autoreleasing",
"__attribute__((objc_ownership(autoreleasing)))");
Builder.defineMacro("__unsafe_unretained",
"__attribute__((objc_ownership(none)))");
}
// On Darwin, there are __double_underscored variants of the type
// nullability qualifiers.
if (TI.getTriple().isOSDarwin()) {
Builder.defineMacro("__nonnull", "_Nonnull");
Builder.defineMacro("__null_unspecified", "_Null_unspecified");
Builder.defineMacro("__nullable", "_Nullable");
}
// OpenMP definition
// OpenMP 2.2:
// In implementations that support a preprocessor, the _OPENMP
// macro name is defined to have the decimal value yyyymm where
// yyyy and mm are the year and the month designations of the
// version of the OpenMP API that the implementation supports.
switch (LangOpts.OpenMP) {
case 0:
break;
case 40:
Builder.defineMacro("_OPENMP", "201307");
break;
case 45:
Builder.defineMacro("_OPENMP", "201511");
break;
default:
// Default version is OpenMP 3.1
Builder.defineMacro("_OPENMP", "201107");
break;
}
// CUDA device path compilation
if (LangOpts.CUDAIsDevice) {
// The CUDA_ARCH value is set for the GPU target specified in the NVPTX
// backend's target defines.
Builder.defineMacro("__CUDA_ARCH__");
}
// We need to communicate this to our CUDA header wrapper, which in turn
// informs the proper CUDA headers of this choice.
if (LangOpts.CUDADeviceApproxTranscendentals || LangOpts.FastMath) {
Builder.defineMacro("__CLANG_CUDA_APPROX_TRANSCENDENTALS__");
}
// OpenCL definitions.
if (LangOpts.OpenCL) {
#define OPENCLEXT(Ext) \
if (TI.getSupportedOpenCLOpts().isSupported(#Ext, \
LangOpts.OpenCLVersion)) \
Builder.defineMacro(#Ext);
#include "clang/Basic/OpenCLExtensions.def"
}
if (TI.hasInt128Type() && LangOpts.CPlusPlus && LangOpts.GNUMode) {
// For each extended integer type, g++ defines a macro mapping the
// index of the type (0 in this case) in some list of extended types
// to the type.
Builder.defineMacro("__GLIBCXX_TYPE_INT_N_0", "__int128");
Builder.defineMacro("__GLIBCXX_BITSIZE_INT_N_0", "128");
}
// Get other target #defines.
TI.getTargetDefines(LangOpts, Builder);
}
/// InitializePreprocessor - Initialize the preprocessor, getting it and the
/// environment ready to process a single file.
///
void clang::InitializePreprocessor(
Preprocessor &PP, const PreprocessorOptions &InitOpts,
const PCHContainerReader &PCHContainerRdr,
const FrontendOptions &FEOpts) {
const LangOptions &LangOpts = PP.getLangOpts();
std::string PredefineBuffer;
PredefineBuffer.reserve(4080);
llvm::raw_string_ostream Predefines(PredefineBuffer);
MacroBuilder Builder(Predefines);
// Emit line markers for various builtin sections of the file. We don't do
// this in asm preprocessor mode, because "# 4" is not a line marker directive
// in this mode.
if (!PP.getLangOpts().AsmPreprocessor)
Builder.append("# 1 \"<built-in>\" 3");
// Install things like __POWERPC__, __GNUC__, etc into the macro table.
if (InitOpts.UsePredefines) {
if (LangOpts.CUDA && PP.getAuxTargetInfo())
InitializePredefinedMacros(*PP.getAuxTargetInfo(), LangOpts, FEOpts,
Builder);
InitializePredefinedMacros(PP.getTargetInfo(), LangOpts, FEOpts, Builder);
// Install definitions to make Objective-C++ ARC work well with various
// C++ Standard Library implementations.
if (LangOpts.ObjC1 && LangOpts.CPlusPlus &&
(LangOpts.ObjCAutoRefCount || LangOpts.ObjCWeak)) {
switch (InitOpts.ObjCXXARCStandardLibrary) {
case ARCXX_nolib:
case ARCXX_libcxx:
break;
case ARCXX_libstdcxx:
AddObjCXXARCLibstdcxxDefines(LangOpts, Builder);
break;
}
}
}
// Even with predefines off, some macros are still predefined.
// These should all be defined in the preprocessor according to the
// current language configuration.
InitializeStandardPredefinedMacros(PP.getTargetInfo(), PP.getLangOpts(),
FEOpts, Builder);
// Add on the predefines from the driver. Wrap in a #line directive to report
// that they come from the command line.
if (!PP.getLangOpts().AsmPreprocessor)
Builder.append("# 1 \"<command line>\" 1");
// Process #define's and #undef's in the order they are given.
for (unsigned i = 0, e = InitOpts.Macros.size(); i != e; ++i) {
if (InitOpts.Macros[i].second) // isUndef
Builder.undefineMacro(InitOpts.Macros[i].first);
else
DefineBuiltinMacro(Builder, InitOpts.Macros[i].first,
PP.getDiagnostics());
}
// Exit the command line and go back to <built-in> (2 is LC_LEAVE).
if (!PP.getLangOpts().AsmPreprocessor)
Builder.append("# 1 \"<built-in>\" 2");
// If -imacros are specified, include them now. These are processed before
// any -include directives.
for (unsigned i = 0, e = InitOpts.MacroIncludes.size(); i != e; ++i)
AddImplicitIncludeMacros(Builder, InitOpts.MacroIncludes[i]);
// Process -include-pch/-include-pth directives.
if (!InitOpts.ImplicitPCHInclude.empty())
AddImplicitIncludePCH(Builder, PP, PCHContainerRdr,
InitOpts.ImplicitPCHInclude);
if (!InitOpts.ImplicitPTHInclude.empty())
AddImplicitIncludePTH(Builder, PP, InitOpts.ImplicitPTHInclude);
// Process -include directives.
for (unsigned i = 0, e = InitOpts.Includes.size(); i != e; ++i) {
const std::string &Path = InitOpts.Includes[i];
AddImplicitInclude(Builder, Path);
}
// Instruct the preprocessor to skip the preamble.
PP.setSkipMainFilePreamble(InitOpts.PrecompiledPreambleBytes.first,
InitOpts.PrecompiledPreambleBytes.second);
// Copy PredefinedBuffer into the Preprocessor.
PP.setPredefines(Predefines.str());
}
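// Illustrative note (not part of the original source): the predefine buffer
// assembled by this function is what a command such as
//
//   clang -dM -E -x c /dev/null
//
// ultimately dumps, one "#define NAME VALUE" line per macro installed above,
// plus the target-specific defines added by TI.getTargetDefines().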
Index: projects/clang400-import/contrib/llvm/tools/clang/lib/StaticAnalyzer/Checkers/VirtualCallChecker.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/tools/clang/lib/StaticAnalyzer/Checkers/VirtualCallChecker.cpp (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/clang/lib/StaticAnalyzer/Checkers/VirtualCallChecker.cpp (revision 314269)
@@ -1,291 +1,292 @@
//=======- VirtualCallChecker.cpp --------------------------------*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines a checker that checks virtual function calls during
// construction or destruction of C++ objects.
//
//===----------------------------------------------------------------------===//
#include "ClangSACheckers.h"
#include "clang/AST/DeclCXX.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/SaveAndRestore.h"
#include "llvm/Support/raw_ostream.h"
using namespace clang;
using namespace ento;
namespace {
class WalkAST : public StmtVisitor<WalkAST> {
const CheckerBase *Checker;
BugReporter &BR;
AnalysisDeclContext *AC;
/// The root constructor or destructor whose callees are being analyzed.
const CXXMethodDecl *RootMethod = nullptr;
/// Whether the checker should walk into bodies of called functions.
/// Controlled by the "Interprocedural" analyzer-config option.
bool IsInterprocedural = false;
/// Whether the checker should only warn for calls to pure virtual functions
/// (which is undefined behavior) or for all virtual functions (which may
/// may result in unexpected behavior).
bool ReportPureOnly = false;
typedef const CallExpr * WorkListUnit;
typedef SmallVector<WorkListUnit, 20> DFSWorkList;
/// A vector representing the worklist which has a chain of CallExprs.
DFSWorkList WList;
// PreVisited : A CallExpr to this FunctionDecl is in the worklist, but the
// body has not been visited yet.
// PostVisited : A CallExpr to this FunctionDecl is in the worklist, and the
// body has been visited.
enum Kind { NotVisited,
PreVisited, /**< A CallExpr to this FunctionDecl is in the
worklist, but the body has not yet been
visited. */
PostVisited /**< A CallExpr to this FunctionDecl is in the
worklist, and the body has been visited. */
};
/// A DenseMap that records visited states of FunctionDecls.
llvm::DenseMap<const FunctionDecl *, Kind> VisitedFunctions;
/// The CallExpr whose body is currently being visited. This is used for
/// generating bug reports. This is null while visiting the body of a
/// constructor or destructor.
const CallExpr *visitingCallExpr;
public:
WalkAST(const CheckerBase *checker, BugReporter &br, AnalysisDeclContext *ac,
const CXXMethodDecl *rootMethod, bool isInterprocedural,
bool reportPureOnly)
: Checker(checker), BR(br), AC(ac), RootMethod(rootMethod),
IsInterprocedural(isInterprocedural), ReportPureOnly(reportPureOnly),
visitingCallExpr(nullptr) {
// Walking should always start from either a constructor or a destructor.
assert(isa<CXXConstructorDecl>(rootMethod) ||
isa<CXXDestructorDecl>(rootMethod));
}
bool hasWork() const { return !WList.empty(); }
/// This method adds a CallExpr to the worklist and marks the callee as
/// being PreVisited.
void Enqueue(WorkListUnit WLUnit) {
const FunctionDecl *FD = WLUnit->getDirectCallee();
if (!FD || !FD->getBody())
return;
Kind &K = VisitedFunctions[FD];
if (K != NotVisited)
return;
K = PreVisited;
WList.push_back(WLUnit);
}
/// This method returns an item from the worklist without removing it.
WorkListUnit Dequeue() {
assert(!WList.empty());
return WList.back();
}
void Execute() {
while (hasWork()) {
WorkListUnit WLUnit = Dequeue();
const FunctionDecl *FD = WLUnit->getDirectCallee();
assert(FD && FD->getBody());
if (VisitedFunctions[FD] == PreVisited) {
// If the callee is PreVisited, walk its body.
// Visit the body.
SaveAndRestore<const CallExpr *> SaveCall(visitingCallExpr, WLUnit);
Visit(FD->getBody());
// Mark the function as being PostVisited to indicate we have
// scanned the body.
VisitedFunctions[FD] = PostVisited;
continue;
}
// Otherwise, the callee is PostVisited.
// Remove it from the worklist.
assert(VisitedFunctions[FD] == PostVisited);
WList.pop_back();
}
}
// Stmt visitor methods.
void VisitCallExpr(CallExpr *CE);
void VisitCXXMemberCallExpr(CallExpr *CE);
void VisitStmt(Stmt *S) { VisitChildren(S); }
void VisitChildren(Stmt *S);
void ReportVirtualCall(const CallExpr *CE, bool isPure);
};
} // end anonymous namespace
//===----------------------------------------------------------------------===//
// AST walking.
//===----------------------------------------------------------------------===//
void WalkAST::VisitChildren(Stmt *S) {
for (Stmt *Child : S->children())
if (Child)
Visit(Child);
}
void WalkAST::VisitCallExpr(CallExpr *CE) {
VisitChildren(CE);
if (IsInterprocedural)
Enqueue(CE);
}
void WalkAST::VisitCXXMemberCallExpr(CallExpr *CE) {
VisitChildren(CE);
bool callIsNonVirtual = false;
// Several situations in which the check is skipped or the call is treated
// as non-virtual.
if (MemberExpr *CME = dyn_cast<MemberExpr>(CE->getCallee())) {
// If the member access is fully qualified (i.e., X::F), then treat
// this as a non-virtual call and do not warn.
if (CME->getQualifier())
callIsNonVirtual = true;
if (Expr *base = CME->getBase()->IgnoreImpCasts()) {
// Elide analyzing the call entirely if the base pointer is not 'this'.
if (!isa<CXXThisExpr>(base))
return;
// If the most derived class is marked final, we know that no subclass
// can override this member.
if (base->getBestDynamicClassType()->hasAttr<FinalAttr>())
callIsNonVirtual = true;
}
}
// Get the callee.
- const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(CE->getDirectCallee());
+ const CXXMethodDecl *MD =
+ dyn_cast_or_null<CXXMethodDecl>(CE->getDirectCallee());
if (MD && MD->isVirtual() && !callIsNonVirtual && !MD->hasAttr<FinalAttr>() &&
!MD->getParent()->hasAttr<FinalAttr>())
ReportVirtualCall(CE, MD->isPure());
if (IsInterprocedural)
Enqueue(CE);
}
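// Illustrative note (not part of the original source): the switch to
// dyn_cast_or_null above matters because getDirectCallee() can be null for a
// CXXMemberCallExpr, e.g. a call through a pointer to member function made
// from a constructor body:
//
//   struct S {
//     void (S::*Fn)();
//     S() { (this->*Fn)(); }   // no direct callee; a plain dyn_cast would
//   };                         // assert on the null pointer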
void WalkAST::ReportVirtualCall(const CallExpr *CE, bool isPure) {
if (ReportPureOnly && !isPure)
return;
SmallString<100> buf;
llvm::raw_svector_ostream os(buf);
// FIXME: The interprocedural diagnostic experience here is not good.
// Ultimately this checker should be re-written to be path sensitive.
// For now, only diagnose intraprocedurally, by default.
if (IsInterprocedural) {
os << "Call Path : ";
// Name of current visiting CallExpr.
os << *CE->getDirectCallee();
// Name of the CallExpr whose body is currently being walked.
if (visitingCallExpr)
os << " <-- " << *visitingCallExpr->getDirectCallee();
// Names of FunctionDecls in worklist with state PostVisited.
for (SmallVectorImpl<const CallExpr *>::iterator I = WList.end(),
E = WList.begin(); I != E; --I) {
const FunctionDecl *FD = (*(I-1))->getDirectCallee();
assert(FD);
if (VisitedFunctions[FD] == PostVisited)
os << " <-- " << *FD;
}
os << "\n";
}
PathDiagnosticLocation CELoc =
PathDiagnosticLocation::createBegin(CE, BR.getSourceManager(), AC);
SourceRange R = CE->getCallee()->getSourceRange();
os << "Call to ";
if (isPure)
os << "pure ";
os << "virtual function during ";
if (isa<CXXConstructorDecl>(RootMethod))
os << "construction ";
else
os << "destruction ";
if (isPure)
os << "has undefined behavior";
else
os << "will not dispatch to derived class";
BR.EmitBasicReport(AC->getDecl(), Checker,
"Call to virtual function during construction or "
"destruction",
"C++ Object Lifecycle", os.str(), CELoc, R);
}
//===----------------------------------------------------------------------===//
// VirtualCallChecker
//===----------------------------------------------------------------------===//
namespace {
class VirtualCallChecker : public Checker<check::ASTDecl<CXXRecordDecl> > {
public:
DefaultBool isInterprocedural;
DefaultBool isPureOnly;
void checkASTDecl(const CXXRecordDecl *RD, AnalysisManager& mgr,
BugReporter &BR) const {
AnalysisDeclContext *ADC = mgr.getAnalysisDeclContext(RD);
// Check the constructors.
for (const auto *I : RD->ctors()) {
if (!I->isCopyOrMoveConstructor())
if (Stmt *Body = I->getBody()) {
WalkAST walker(this, BR, ADC, I, isInterprocedural, isPureOnly);
walker.Visit(Body);
walker.Execute();
}
}
// Check the destructor.
if (CXXDestructorDecl *DD = RD->getDestructor())
if (Stmt *Body = DD->getBody()) {
WalkAST walker(this, BR, ADC, DD, isInterprocedural, isPureOnly);
walker.Visit(Body);
walker.Execute();
}
}
};
}
void ento::registerVirtualCallChecker(CheckerManager &mgr) {
VirtualCallChecker *checker = mgr.registerChecker<VirtualCallChecker>();
checker->isInterprocedural =
mgr.getAnalyzerOptions().getBooleanOption("Interprocedural", false,
checker);
checker->isPureOnly =
mgr.getAnalyzerOptions().getBooleanOption("PureOnly", false,
checker);
}
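// Illustrative sketch (not part of the original source): the pattern this
// checker warns about, a virtual call made while the object is still being
// constructed, so the derived override is never reached:
//
//   struct Base {
//     Base() { init(); }       // "Call to virtual function during
//     virtual void init() {}   //  construction will not dispatch to
//   };                         //  derived class"
//   struct Derived : Base {
//     void init() override {}  // never called from Base::Base()
//   };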
Index: projects/clang400-import/contrib/llvm/tools/clang
===================================================================
--- projects/clang400-import/contrib/llvm/tools/clang (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/clang (revision 314269)
Property changes on: projects/clang400-import/contrib/llvm/tools/clang
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/clang/dist:r314176-314267
Index: projects/clang400-import/contrib/llvm/tools/lld
===================================================================
--- projects/clang400-import/contrib/llvm/tools/lld (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/lld (revision 314269)
Property changes on: projects/clang400-import/contrib/llvm/tools/lld
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/lld/dist:r314176-314268
Index: projects/clang400-import/contrib/llvm/tools/lldb
===================================================================
--- projects/clang400-import/contrib/llvm/tools/lldb (revision 314268)
+++ projects/clang400-import/contrib/llvm/tools/lldb (revision 314269)
Property changes on: projects/clang400-import/contrib/llvm/tools/lldb
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/lldb/dist:r314176-314268
Index: projects/clang400-import/contrib/llvm
===================================================================
--- projects/clang400-import/contrib/llvm (revision 314268)
+++ projects/clang400-import/contrib/llvm (revision 314269)
Property changes on: projects/clang400-import/contrib/llvm
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/llvm/dist:r314176-314267
Index: projects/clang400-import/lib/clang/include/clang/Basic/Version.inc
===================================================================
--- projects/clang400-import/lib/clang/include/clang/Basic/Version.inc (revision 314268)
+++ projects/clang400-import/lib/clang/include/clang/Basic/Version.inc (revision 314269)
@@ -1,11 +1,11 @@
/* $FreeBSD$ */
#define CLANG_VERSION 4.0.0
#define CLANG_VERSION_STRING "4.0.0"
#define CLANG_VERSION_MAJOR 4
#define CLANG_VERSION_MINOR 0
#define CLANG_VERSION_PATCHLEVEL 0
#define CLANG_VENDOR "FreeBSD "
-#define SVN_REVISION "296002"
+#define SVN_REVISION "296202"
Index: projects/clang400-import/lib/clang/include/lld/Config/Version.inc
===================================================================
--- projects/clang400-import/lib/clang/include/lld/Config/Version.inc (revision 314268)
+++ projects/clang400-import/lib/clang/include/lld/Config/Version.inc (revision 314269)
@@ -1,8 +1,8 @@
// $FreeBSD$
#define LLD_VERSION 4.0.0
#define LLD_VERSION_STRING "4.0.0"
#define LLD_VERSION_MAJOR 4
#define LLD_VERSION_MINOR 0
-#define LLD_REVISION_STRING "296002"
+#define LLD_REVISION_STRING "296202"
#define LLD_REPOSITORY_STRING "FreeBSD"