No OneTemporary
Actions

Size

706 KB

Referenced Files

None

Subscribers

None

View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	Index: projects/clang400-import/contrib/compiler-rt
	===================================================================
	--- projects/clang400-import/contrib/compiler-rt (revision 314268)
	+++ projects/clang400-import/contrib/compiler-rt (revision 314269)

	Property changes on: projects/clang400-import/contrib/compiler-rt
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/compiler-rt/dist:r314177-314268
	Index: projects/clang400-import/contrib/libc++
	===================================================================
	--- projects/clang400-import/contrib/libc++ (revision 314268)
	+++ projects/clang400-import/contrib/libc++ (revision 314269)

	Property changes on: projects/clang400-import/contrib/libc++
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/libc++/dist:r314177-314268
	Index: projects/clang400-import/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
	===================================================================
	--- projects/clang400-import/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h (revision 314268)
	+++ projects/clang400-import/contrib/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h (revision 314269)
	@@ -1,118 +1,112 @@
	//===---- SLPVectorizer.h ---------------------------------------- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
	// stores that can be put together into vector-stores. Next, it attempts to
	// construct vectorizable tree using the use-def chains. If a profitable tree
	// was found, the SLP vectorizer performs vectorization on the tree.
	//
	// The pass is inspired by the work described in the paper:
	// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H
	#define LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H

	#include "llvm/ADT/MapVector.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/AssumptionCache.h"
	#include "llvm/Analysis/DemandedBits.h"
	#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/ScalarEvolution.h"
	#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/PassManager.h"

	namespace llvm {

	/// A private "module" namespace for types and utilities used by this pass.
	/// These are implementation details and should not be used by clients.
	namespace slpvectorizer {
	class BoUpSLP;
	}

	/// Pass object for the bottom-up SLP vectorizer.
	struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
	typedef SmallVector<StoreInst *, 8> StoreList;
	typedef MapVector<Value *, StoreList> StoreListMap;
	typedef SmallVector<WeakVH, 8> WeakVHList;
	typedef MapVector<Value *, WeakVHList> WeakVHListMap;

	// Handles to the analyses used by the pass; each one defaults to nullptr.
	ScalarEvolution *SE = nullptr;
	TargetTransformInfo *TTI = nullptr;
	TargetLibraryInfo *TLI = nullptr;
	AliasAnalysis *AA = nullptr;
	LoopInfo *LI = nullptr;
	DominatorTree *DT = nullptr;
	AssumptionCache *AC = nullptr;
	DemandedBits *DB = nullptr;
	const DataLayout *DL = nullptr;

	public:
	PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);

	// Glue for old PM. The analyses are passed by pointer, matching the pointer
	// members declared above.
	bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
	TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_,
	DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_);

	private:
	/// \brief Collect store and getelementptr instructions and organize them
	/// according to the underlying object of their pointer operands. We sort the
	/// instructions by their underlying objects to reduce the cost of
	/// consecutive access queries.
	///
	/// TODO: We can further reduce this cost if we flush the chain creation
	/// every time we run into a memory barrier.
	void collectSeedInstructions(BasicBlock *BB);

	/// \brief Try to vectorize a chain that starts at two arithmetic instrs.
	bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);

	/// \brief Try to vectorize a list of operands.
	/// \param BuildVector A list of users to ignore for the purpose of
	/// scheduling and that don't need extracting.
	/// \returns true if a value was vectorized.
	bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
	ArrayRef<Value *> BuildVector = None,
	bool AllowReorder = false);

	/// \brief Try to vectorize a chain that may start at the operands of \p V.
	bool tryToVectorize(BinaryOperator *V, slpvectorizer::BoUpSLP &R);

	/// \brief Vectorize the store instructions collected in Stores.
	bool vectorizeStoreChains(slpvectorizer::BoUpSLP &R);

	/// \brief Vectorize the index computations of the getelementptr instructions
	/// collected in GEPs.
	bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);

	- /// Try to find horizontal reduction or otherwise vectorize a chain of binary
	- /// operators.
	- bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
	- slpvectorizer::BoUpSLP &R,
	- TargetTransformInfo *TTI);
	-
	/// \brief Scan the basic block and look for patterns that are likely to start
	/// a vectorization chain.
	bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);

	bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
	unsigned VecRegSize);

	bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);

	/// The store instructions in a basic block organized by base pointer.
	StoreListMap Stores;

	/// The getelementptr instructions in a basic block organized by base pointer.
	WeakVHListMap GEPs;
	};
	}

	#endif // LLVM_TRANSFORMS_VECTORIZE_SLPVECTORIZER_H
	Index: projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td (revision 314268)
	+++ projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td (revision 314269)
	@@ -1,1094 +1,1099 @@
	//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	// This file was originally auto-generated from a GPU register header file and
	// all the instruction definitions were originally commented out. Instructions
	// that are not yet supported remain commented out.
	//===----------------------------------------------------------------------===//

	def isGCN : Predicate<"Subtarget->getGeneration() "
	">= SISubtarget::SOUTHERN_ISLANDS">,
	AssemblerPredicate<"FeatureGCN">;
	def isSI : Predicate<"Subtarget->getGeneration() "
	"== SISubtarget::SOUTHERN_ISLANDS">,
	AssemblerPredicate<"FeatureSouthernIslands">;

	def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
	def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
	def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
	AssemblerPredicate<"FeatureVGPRIndexMode">;
	def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
	AssemblerPredicate<"FeatureMovrel">;

	include "VOPInstructions.td"
	include "SOPInstructions.td"
	include "SMInstructions.td"
	include "FLATInstructions.td"
	include "BUFInstructions.td"

	let SubtargetPredicate = isGCN in {

	//===----------------------------------------------------------------------===//
	// EXP Instructions
	//===----------------------------------------------------------------------===//

	defm EXP : EXP_m<0, AMDGPUexport>;
	defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;

	//===----------------------------------------------------------------------===//
	// VINTRP Instructions
	//===----------------------------------------------------------------------===//

	let Uses = [M0, EXEC] in {

	// FIXME: Specify SchedRW for VINTRP instructions.

	multiclass V_INTERP_P1_F32_m : VINTRP_m <
	0x00000000,
	(outs VGPR_32:$vdst),
	(ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
	"v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan",
	[(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan),
	(i32 imm:$attr)))]
	>;

	let OtherPredicates = [has32BankLDS] in {

	defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;

	} // End OtherPredicates = [has32BankLDS]

	let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {

	defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;

	} // End OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1

	let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {

	defm V_INTERP_P2_F32 : VINTRP_m <
	0x00000001,
	(outs VGPR_32:$vdst),
	(ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
	"v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan",
	[(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
	(i32 imm:$attr)))]>;

	} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"

	defm V_INTERP_MOV_F32 : VINTRP_m <
	0x00000002,
	(outs VGPR_32:$vdst),
	(ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
	"v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan",
	[(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
	(i32 imm:$attr)))]>;

	} // End Uses = [M0, EXEC]

	//===----------------------------------------------------------------------===//
	// Pseudo Instructions
	//===----------------------------------------------------------------------===//

	let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {

	// For use in patterns
	def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
	(ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
	let isPseudo = 1;
	let isCodeGenOnly = 1;
	let usesCustomInserter = 1;
	}

	// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
	// pass to enable folding of inline immediates.
	def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
	(ins VSrc_b64:$src0)>;
	} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

	let usesCustomInserter = 1, SALU = 1 in {
	def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
	[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
	} // End let usesCustomInserter = 1, SALU = 1

	def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),
	(ins SSrc_b64:$src0)> {
	let SALU = 1;
	let isAsCheapAsAMove = 1;
	let isTerminator = 1;
	}

	def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst),
	(ins SSrc_b64:$src0, SSrc_b64:$src1)> {
	let SALU = 1;
	let isAsCheapAsAMove = 1;
	let isTerminator = 1;
	}

	def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst),
	(ins SSrc_b64:$src0, SSrc_b64:$src1)> {
	let SALU = 1;
	let isAsCheapAsAMove = 1;
	let isTerminator = 1;
	}

	def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
	[(int_amdgcn_wave_barrier)]> {
	let SchedRW = [];
	let hasNoSchedulingInfo = 1;
	let hasSideEffects = 1;
	let mayLoad = 1;
	let mayStore = 1;
	let isBarrier = 1;
	let isConvergent = 1;
	}

	// SI pseudo instructions. These are used by the CFG structurizer pass
	// and should be lowered to ISA instructions prior to codegen.

	// Dummy terminator instruction to use after control flow instructions
	// replaced with exec mask operations.
	def SI_MASK_BRANCH : PseudoInstSI <
	(outs), (ins brtarget:$target)> {
	let isBranch = 0;
	let isTerminator = 1;
	let isBarrier = 0;
	let Uses = [EXEC];
	let SchedRW = [];
	let hasNoSchedulingInfo = 1;
	}

	let isTerminator = 1 in {

	def SI_IF: CFPseudoInstSI <
	(outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
	[(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
	let Constraints = "";
	let Size = 12;
	let mayLoad = 1;
	let mayStore = 1;
	let hasSideEffects = 1;
	}

	def SI_ELSE : CFPseudoInstSI <
	(outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
	let Constraints = "$src = $dst";
	let Size = 12;
	let mayStore = 1;
	let mayLoad = 1;
	let hasSideEffects = 1;
	}

	def SI_LOOP : CFPseudoInstSI <
	(outs), (ins SReg_64:$saved, brtarget:$target),
	[(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
	let Size = 8;
	let isBranch = 1;
	let hasSideEffects = 1;
	let mayLoad = 1;
	let mayStore = 1;
	}

	} // End isTerminator = 1

	def SI_END_CF : CFPseudoInstSI <
	(outs), (ins SReg_64:$saved),
	[(int_amdgcn_end_cf i64:$saved)], 1, 1> {
	let Size = 4;
	let isAsCheapAsAMove = 1;
	let isReMaterializable = 1;
	let mayLoad = 1;
	let mayStore = 1;
	let hasSideEffects = 1;
	}

	def SI_BREAK : CFPseudoInstSI <
	(outs SReg_64:$dst), (ins SReg_64:$src),
	[(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
	let Size = 4;
	let isAsCheapAsAMove = 1;
	let isReMaterializable = 1;
	}

	def SI_IF_BREAK : CFPseudoInstSI <
	(outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
	[(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
	let Size = 4;
	let isAsCheapAsAMove = 1;
	let isReMaterializable = 1;
	}

	def SI_ELSE_BREAK : CFPseudoInstSI <
	(outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
	[(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
	let Size = 4;
	let isAsCheapAsAMove = 1;
	let isReMaterializable = 1;
	}

	let Uses = [EXEC], Defs = [EXEC,VCC] in {
	def SI_KILL : PseudoInstSI <
	(outs), (ins VSrc_b32:$src),
	[(AMDGPUkill i32:$src)]> {
	let isConvergent = 1;
	let usesCustomInserter = 1;
	}

	def SI_KILL_TERMINATOR : SPseudoInstSI <
	(outs), (ins VSrc_b32:$src)> {
	let isTerminator = 1;
	}

	} // End Uses = [EXEC], Defs = [EXEC,VCC]

	// Branch on undef scc. Used to avoid intermediate copy from
	// IMPLICIT_DEF to SCC.
	def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
	let isTerminator = 1;
	let usesCustomInserter = 1;
	}

	def SI_PS_LIVE : PseudoInstSI <
	(outs SReg_64:$dst), (ins),
	[(set i1:$dst, (int_amdgcn_ps_live))]> {
	let SALU = 1;
	}

	// Used as an isel pseudo to directly emit initialization with an
	// s_mov_b32 rather than a copy of another initialized
	// register. MachineCSE skips copies, and we don't want to have to
	// fold operands before it runs.
	def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
	let Defs = [M0];
	let usesCustomInserter = 1;
	let isAsCheapAsAMove = 1;
	let isReMaterializable = 1;
	}

	def SI_RETURN : SPseudoInstSI <
	(outs), (ins variable_ops), [(AMDGPUreturn)]> {
	let isTerminator = 1;
	let isBarrier = 1;
	let isReturn = 1;
	let hasSideEffects = 1;
	let hasNoSchedulingInfo = 1;
	let DisableWQM = 1;
	}

	let Defs = [M0, EXEC],
	UseNamedOperandTable = 1 in {

	class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
	(outs VGPR_32:$vdst),
	(ins rc:$src, VS_32:$idx, i32imm:$offset)> {
	let usesCustomInserter = 1;
	}

	class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
	(outs rc:$vdst),
	(ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
	let Constraints = "$src = $vdst";
	let usesCustomInserter = 1;
	}

	// TODO: We can support indirect SGPR access.
	def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
	def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
	def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
	def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
	def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;

	def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
	def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
	def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
	def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
	def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;

	} // End Defs = [M0, EXEC], UseNamedOperandTable = 1

	// Defines a _SAVE / _RESTORE pseudo pair for spilling registers of
	// sgpr_class. _SAVE takes the data plus an immediate $addr and is marked
	// store-only; _RESTORE produces the data from an immediate $addr and is
	// marked load-only.
	multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
	let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
	def _SAVE : PseudoInstSI <
	(outs),
	(ins sgpr_class:$data, i32imm:$addr)> {
	let mayStore = 1;
	let mayLoad = 0;
	}

	def _RESTORE : PseudoInstSI <
	(outs sgpr_class:$data),
	(ins i32imm:$addr)> {
	let mayStore = 0;
	let mayLoad = 1;
	}
	} // End UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC]
	}

	// You cannot use M0 as the output of v_readlane_b32 instructions or
	// use it in the sdata operand of SMEM instructions. We still need to
	// be able to spill the physical register m0, so allow it for
	// SI_SPILL_S32_* instructions.
	defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
	defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
	defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
	defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
	defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;

	// Defines a _SAVE / _RESTORE pseudo pair for spilling registers of
	// vgpr_class. Both pseudos carry $vaddr, $srsrc, $soffset and $offset
	// operands; _SAVE is marked store-only and _RESTORE load-only.
	multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
	let UseNamedOperandTable = 1, VGPRSpill = 1,
	SchedRW = [WriteVMEM] in {
	def _SAVE : VPseudoInstSI <
	(outs),
	(ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
	SReg_32:$soffset, i32imm:$offset)> {
	let mayStore = 1;
	let mayLoad = 0;
	// (2 * 4) + (8 * num_subregs) bytes maximum
	// The expression below evaluates ((vgpr_class.Size >> 5) << 3) + 8, so
	// num_subregs is vgpr_class.Size / 32.
	let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
	}

	def _RESTORE : VPseudoInstSI <
	(outs vgpr_class:$vdata),
	(ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
	i32imm:$offset)> {
	let mayStore = 0;
	let mayLoad = 1;

	// (2 * 4) + (8 * num_subregs) bytes maximum
	// Same size expression as in _SAVE above.
	let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
	}
	} // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
	}

	defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
	defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
	defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
	defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
	defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
	defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;

	def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
	(outs SReg_64:$dst),
	(ins si_ga:$ptr_lo, si_ga:$ptr_hi),
	[(set SReg_64:$dst,
	(i64 (SIpc_add_rel_offset (tglobaladdr:$ptr_lo), (tglobaladdr:$ptr_hi))))]> {
	let Defs = [SCC];
	}

	} // End SubtargetPredicate = isGCN

	let Predicates = [isGCN] in {

	def : Pat<
	(int_amdgcn_else i64:$src, bb:$target),
	(SI_ELSE $src, $target, 0)
	>;

	def : Pat <
	(int_AMDGPU_kilp),
	(SI_KILL (i32 0xbf800000))
	>;

	//===----------------------------------------------------------------------===//
	// VOP1 Patterns
	//===----------------------------------------------------------------------===//

	let Predicates = [UnsafeFPMath] in {

	//def : RcpPat<V_RCP_F64_e32, f64>;
	//defm : RsqPat<V_RSQ_F64_e32, f64>;
	//defm : RsqPat<V_RSQ_F32_e32, f32>;

	def : RsqPat<V_RSQ_F32_e32, f32>;
	def : RsqPat<V_RSQ_F64_e32, f64>;

	// Convert (x - floor(x)) to fract(x)
	def : Pat <
	(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
	(f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
	(V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
	>;

	// Convert (x + (-floor(x))) to fract(x)
	def : Pat <
	(f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
	(f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
	(V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
	>;

	} // End Predicates = [UnsafeFPMath]

	def : Pat <
	(f32 (fpextend f16:$src)),
	(V_CVT_F32_F16_e32 $src)
	>;

	def : Pat <
	(f64 (fpextend f16:$src)),
	(V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
	>;

	def : Pat <
	(f16 (fpround f32:$src)),
	(V_CVT_F16_F32_e32 $src)
	>;

	def : Pat <
	(f16 (fpround f64:$src)),
	(V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src))
	>;

	def : Pat <
	(i32 (fp_to_sint f16:$src)),
	(V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src))
	>;

	def : Pat <
	(i32 (fp_to_uint f16:$src)),
	(V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src))
	>;

	def : Pat <
	(f16 (sint_to_fp i32:$src)),
	(V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src))
	>;

	def : Pat <
	(f16 (uint_to_fp i32:$src)),
	(V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src))
	>;

	//===----------------------------------------------------------------------===//
	// VOP2 Patterns
	//===----------------------------------------------------------------------===//

	multiclass FMADPat <ValueType vt, Instruction inst> {
	def : Pat <
	(vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
	(VOP3NoMods vt:$src1, i32:$src1_modifiers),
	(VOP3NoMods vt:$src2, i32:$src2_modifiers))),
	(inst $src0_modifiers, $src0, $src1_modifiers, $src1,
	$src2_modifiers, $src2, $clamp, $omod)
	>;
	}

	defm : FMADPat <f16, V_MAC_F16_e64>;
	defm : FMADPat <f32, V_MAC_F32_e64>;

	multiclass SelectPat <ValueType vt, Instruction inst> {
	def : Pat <
	(vt (select i1:$src0, vt:$src1, vt:$src2)),
	(inst $src2, $src1, $src0)
	>;
	}

	defm : SelectPat <i16, V_CNDMASK_B32_e64>;
	defm : SelectPat <i32, V_CNDMASK_B32_e64>;
	defm : SelectPat <f16, V_CNDMASK_B32_e64>;
	defm : SelectPat <f32, V_CNDMASK_B32_e64>;

	def : Pat <
	(i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
	(V_BCNT_U32_B32_e64 $popcnt, $val)
	>;

	/******** ============================================ ********/
	/******** Extraction, Insertion, Building and Casting ********/
	/******** ============================================ ********/

	// Per-index Extract_Element / Insert_Element records for the two-element
	// i32 and f32 vector types.
	// NOTE(review): the range 0-2 produces three indices (0, 1, 2) although
	// v2i32/v2f32 have only two elements; the other loops in this group use
	// 0-3, 0-7 and 0-15 for 4, 8 and 16 elements. Index 2 looks unintended --
	// confirm nothing references the *_2 records before narrowing to 0-1.
	foreach Index = 0-2 in {
	def Extract_Element_v2i32_#Index : Extract_Element <
	i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	def Insert_Element_v2i32_#Index : Insert_Element <
	i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
	>;

	def Extract_Element_v2f32_#Index : Extract_Element <
	f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	def Insert_Element_v2f32_#Index : Insert_Element <
	f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	}

	foreach Index = 0-3 in {
	def Extract_Element_v4i32_#Index : Extract_Element <
	i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	def Insert_Element_v4i32_#Index : Insert_Element <
	i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
	>;

	def Extract_Element_v4f32_#Index : Extract_Element <
	f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	def Insert_Element_v4f32_#Index : Insert_Element <
	f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	}

	foreach Index = 0-7 in {
	def Extract_Element_v8i32_#Index : Extract_Element <
	i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	def Insert_Element_v8i32_#Index : Insert_Element <
	i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
	>;

	def Extract_Element_v8f32_#Index : Extract_Element <
	f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	def Insert_Element_v8f32_#Index : Insert_Element <
	f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	}

	foreach Index = 0-15 in {
	def Extract_Element_v16i32_#Index : Extract_Element <
	i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	def Insert_Element_v16i32_#Index : Insert_Element <
	i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
	>;

	def Extract_Element_v16f32_#Index : Extract_Element <
	f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	def Insert_Element_v16f32_#Index : Insert_Element <
	f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
	>;
	}

	// FIXME: Why do only some of these type combinations for SReg and
	// VReg?
	// 16-bit bitcast
	def : BitConvert <i16, f16, VGPR_32>;
	def : BitConvert <f16, i16, VGPR_32>;
	def : BitConvert <i16, f16, SReg_32>;
	def : BitConvert <f16, i16, SReg_32>;

	// 32-bit bitcast
	def : BitConvert <i32, f32, VGPR_32>;
	def : BitConvert <f32, i32, VGPR_32>;
	def : BitConvert <i32, f32, SReg_32>;
	def : BitConvert <f32, i32, SReg_32>;

	// 64-bit bitcast
	def : BitConvert <i64, f64, VReg_64>;
	def : BitConvert <f64, i64, VReg_64>;
	def : BitConvert <v2i32, v2f32, VReg_64>;
	def : BitConvert <v2f32, v2i32, VReg_64>;
	def : BitConvert <i64, v2i32, VReg_64>;
	def : BitConvert <v2i32, i64, VReg_64>;
	def : BitConvert <i64, v2f32, VReg_64>;
	def : BitConvert <v2f32, i64, VReg_64>;
	def : BitConvert <f64, v2f32, VReg_64>;
	def : BitConvert <v2f32, f64, VReg_64>;
	def : BitConvert <f64, v2i32, VReg_64>;
	def : BitConvert <v2i32, f64, VReg_64>;

	// 128-bit bitcast
	// The v4i32 <-> v4f32 converts use VReg_128, so they belong to this group.
	def : BitConvert <v4i32, v4f32, VReg_128>;
	def : BitConvert <v4f32, v4i32, VReg_128>;
	def : BitConvert <v2i64, v4i32, SReg_128>;
	def : BitConvert <v4i32, v2i64, SReg_128>;
	def : BitConvert <v2f64, v4f32, VReg_128>;
	def : BitConvert <v2f64, v4i32, VReg_128>;
	def : BitConvert <v4f32, v2f64, VReg_128>;
	def : BitConvert <v4i32, v2f64, VReg_128>;
	def : BitConvert <v2i64, v2f64, VReg_128>;
	def : BitConvert <v2f64, v2i64, VReg_128>;

	// 256-bit bitcast
	def : BitConvert <v8i32, v8f32, SReg_256>;
	def : BitConvert <v8f32, v8i32, SReg_256>;
	def : BitConvert <v8i32, v8f32, VReg_256>;
	def : BitConvert <v8f32, v8i32, VReg_256>;

	// 512-bit bitcast
	def : BitConvert <v16i32, v16f32, VReg_512>;
	def : BitConvert <v16f32, v16i32, VReg_512>;

	/******** =================== ********/
	/******** Src & Dst modifiers ********/
	/******** =================== ********/

	def : Pat <
	(AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
	(f32 FP_ZERO), (f32 FP_ONE)),
	(V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
	>;

	/******** ================================ ********/
	/******** Floating point absolute/negative ********/
	/******** ================================ ********/

	// Prevent expanding both fneg and fabs.

	def : Pat <
	(fneg (fabs f32:$src)),
	(S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit
	>;

	// FIXME: Should use S_OR_B32
	def : Pat <
	(fneg (fabs f64:$src)),
	(REG_SEQUENCE VReg_64,
	(i32 (EXTRACT_SUBREG f64:$src, sub0)),
	sub0,
	(V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
	(V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
	sub1)
	>;

	def : Pat <
	(fabs f32:$src),
	(V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
	>;

	def : Pat <
	(fneg f32:$src),
	(V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000)))
	>;

	// fabs on f64: keep the low half (sub0) as is and AND the high half (sub1)
	// with 0x7fffffff.
	def : Pat <
	(fabs f64:$src),
	(REG_SEQUENCE VReg_64,
	(i32 (EXTRACT_SUBREG f64:$src, sub0)),
	sub0,
	(V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
	(V_MOV_B32_e32 (i32 0x7fffffff))), // Clear sign bit.
	sub1)
	>;

	def : Pat <
	(fneg f64:$src),
	(REG_SEQUENCE VReg_64,
	(i32 (EXTRACT_SUBREG f64:$src, sub0)),
	sub0,
	(V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
	(i32 (V_MOV_B32_e32 (i32 0x80000000)))),
	sub1)
	>;

	def : Pat <
	(fneg f16:$src),
	(V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
	>;

	def : Pat <
	(fabs f16:$src),
	(V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff)))
	>;

	def : Pat <
	(fneg (fabs f16:$src)),
	(S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
	>;

	/******** ================== ********/
	/******** Immediate Patterns ********/
	/******** ================== ********/

	def : Pat <
	(VGPRImm<(i32 imm)>:$imm),
	(V_MOV_B32_e32 imm:$imm)
	>;

	def : Pat <
	(VGPRImm<(f32 fpimm)>:$imm),
	(V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
	>;

	def : Pat <
	(i32 imm:$imm),
	(S_MOV_B32 imm:$imm)
	>;

	// FIXME: Workaround for ordering issue with peephole optimizer where
	// a register class copy interferes with immediate folding. Should
	// use s_mov_b32, which can be shrunk to s_movk_i32
	def : Pat <
	(VGPRImm<(f16 fpimm)>:$imm),
	(V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
	>;

	def : Pat <
	(f32 fpimm:$imm),
	(S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
	>;

	def : Pat <
	(f16 fpimm:$imm),
	(S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
	>;

	def : Pat <
	(i32 frameindex:$fi),
	(V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
	>;

	def : Pat <
	(i64 InlineImm<i64>:$imm),
	(S_MOV_B64 InlineImm<i64>:$imm)
	>;

	// XXX - Should this use a s_cmp to set SCC?

	// Set to sign-extended 64-bit value (true = -1, false = 0)
	def : Pat <
	(i1 imm:$imm),
	(S_MOV_B64 (i64 (as_i64imm $imm)))
	>;

	def : Pat <
	(f64 InlineFPImm<f64>:$imm),
	(S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm)))
	>;

	/******** ================== ********/
	/******** Intrinsic Patterns ********/
	/******** ================== ********/

	def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;

	// Expand int_AMDGPU_cube into four V_CUBE* instructions, one per lane of the
	// VReg_128 result (sub0..sub3). Each instruction reads lanes sub0..sub2 of
	// $src, with every source-modifier, clamp and omod operand set to 0.
	def : Pat <
	(int_AMDGPU_cube v4f32:$src),
	(REG_SEQUENCE VReg_128,
	(V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
	0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
	0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
	0 /* clamp */, 0 /* omod */), sub0,
	(V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
	0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
	0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
	0 /* clamp */, 0 /* omod */), sub1,
	(V_CUBEMA_F32 0 /* src0_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
	0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
	0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
	0 /* clamp */, 0 /* omod */), sub2,
	(V_CUBEID_F32 0 /* src0_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
	0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
	0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
	0 /* clamp */, 0 /* omod */), sub3)
	>;

	def : Pat <
	(i32 (sext i1:$src0)),
	(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
	>;

	class Ext32Pat <SDNode ext> : Pat <
	(i32 (ext i1:$src0)),
	(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0)
	>;

	def : Ext32Pat <zext>;
	def : Ext32Pat <anyext>;

	// The multiplication scales from [0,1] to the unsigned integer range
	def : Pat <
	(AMDGPUurecip i32:$src0),
	(V_CVT_U32_F32_e32
	(V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
	(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
	>;

	//===----------------------------------------------------------------------===//
	// VOP3 Patterns
	//===----------------------------------------------------------------------===//

	def : IMad24Pat<V_MAD_I32_I24>;
	def : UMad24Pat<V_MAD_U32_U24>;

	defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
	def : ROTRPattern <V_ALIGNBIT_B32>;

	/******** ====================== ********/
	/******** Indirect addressing ********/
	/******** ====================== ********/

	multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
	// Extract with offset
	def : Pat<
	(eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
	(!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
	>;

	// Insert with offset
	def : Pat<
	(insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
	(!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
	>;
	}

	defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
	defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
	defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
	defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;

	defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
	defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
	defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
	defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;

	//===----------------------------------------------------------------------===//
	// SAD Patterns
	//===----------------------------------------------------------------------===//

	def : Pat <
	(add (sub_oneuse (umax i32:$src0, i32:$src1),
	(umin i32:$src0, i32:$src1)),
	i32:$src2),
	(V_SAD_U32 $src0, $src1, $src2)
	>;

	def : Pat <
	(add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)),
	(sub i32:$src0, i32:$src1),
	(sub i32:$src1, i32:$src0)),
	i32:$src2),
	(V_SAD_U32 $src0, $src1, $src2)
	>;

	//===----------------------------------------------------------------------===//
	// Conversion Patterns
	//===----------------------------------------------------------------------===//

	// sext_inreg patterns selected to S_BFE_I32 / S_BFE_I64. In every pattern the
	// immediate operand is written as 0 | (N << 16), where N is the bit width of
	// the sext_inreg source type (1, 8, 16 or 32).
	// NOTE(review): presumably this encodes field offset | (field width << 16) --
	// confirm against the S_BFE description in the ISA manual.
	def : Pat<(i32 (sext_inreg i32:$src, i1)),
	(S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16

	// Handle sext_inreg in i64
	def : Pat <
	(i64 (sext_inreg i64:$src, i1)),
	(S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
	>;

	def : Pat <
	(i16 (sext_inreg i16:$src, i1)),
	(S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
	>;

	def : Pat <
	(i16 (sext_inreg i16:$src, i8)),
	(S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
	>;

	def : Pat <
	(i64 (sext_inreg i64:$src, i8)),
	(S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
	>;

	def : Pat <
	(i64 (sext_inreg i64:$src, i16)),
	(S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
	>;

	def : Pat <
	(i64 (sext_inreg i64:$src, i32)),
	(S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
	>;

	def : Pat <
	(i64 (zext i32:$src)),
	(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
	>;

	def : Pat <
	(i64 (anyext i32:$src)),
	(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
	>;

	// Extends an i1 to i64 for the extension node `ext`. The result is a
	// VReg_64 REG_SEQUENCE: sub0 is a V_CNDMASK_B32_e64 choosing between the
	// constants 0 and 1 based on $src, sub1 is a constant 0 from S_MOV_B32.
	class ZExt_i64_i1_Pat <SDNode ext> : Pat <
	(i64 (ext i1:$src)),
	(REG_SEQUENCE VReg_64,
	(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
	(S_MOV_B32 (i32 0)), sub1)
	>;


	def : ZExt_i64_i1_Pat<zext>;
	def : ZExt_i64_i1_Pat<anyext>;

	// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
	// REG_SEQUENCE patterns don't support instructions with multiple outputs.
	def : Pat <
	(i64 (sext i32:$src)),
	(REG_SEQUENCE SReg_64, $src, sub0,
	(i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
	>;

	def : Pat <
	(i64 (sext i1:$src)),
	(REG_SEQUENCE VReg_64,
	(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
	(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
	>;

	// Floating point to i1 conversion, selected to a compare against a constant.
	//   Inst      - compare instruction to emit (the uses below pass V_CMP_EQ_*)
	//   KOne      - constant used as the first compare source
	//   kone_type - integer value type wrapping KOne
	//   vt        - floating point source value type
	//   fp_to_int - conversion node being matched
	// $src0 and its modifiers (matched through VOP3Mods) form the second compare
	// source; no clamp or output modifier is applied.
	class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
	(i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
	(i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
	>;

	// fp_to_uint compares against CONST.FP*_ONE, fp_to_sint against
	// CONST.FP*_NEG_ONE.
	def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
	def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
	def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
	def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;

	// If we need to perform a logical operation on i1 values, we need to
	// use vector comparisons since there is only one SCC register. Vector
	// comparisons still write to a pair of SGPRs, so treat these as
	// 64-bit comparisons. When legalizing SGPR copies, instructions
	// resulting in the copies from SCC to these instructions will be
	// moved to the VALU.
	def : Pat <
	(i1 (and i1:$src0, i1:$src1)),
	(S_AND_B64 $src0, $src1)
	>;

	def : Pat <
	(i1 (or i1:$src0, i1:$src1)),
	(S_OR_B64 $src0, $src1)
	>;

	def : Pat <
	(i1 (xor i1:$src0, i1:$src1)),
	(S_XOR_B64 $src0, $src1)
	>;

	def : Pat <
	(f32 (sint_to_fp i1:$src)),
	(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
	>;

	def : Pat <
	(f32 (uint_to_fp i1:$src)),
	(V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
	>;

	def : Pat <
	(f64 (sint_to_fp i1:$src)),
	(V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src))
	>;

	def : Pat <
	(f64 (uint_to_fp i1:$src)),
	(V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src))
	>;

	//===----------------------------------------------------------------------===//
	// Miscellaneous Patterns
	//===----------------------------------------------------------------------===//

	def : Pat <
	(i32 (trunc i64:$a)),
	(EXTRACT_SUBREG $a, sub0)
	>;

	def : Pat <
	(i1 (trunc i32:$a)),
	(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
	>;

	def : Pat <
	+ (i1 (trunc i16:$a)),
	+ (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
	+>;
	+
	+def : Pat <
	(i1 (trunc i64:$a)),
	(V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
	(i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
	>;

	def : Pat <
	(i32 (bswap i32:$a)),
	(V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
	(V_ALIGNBIT_B32 $a, $a, (i32 24)),
	(V_ALIGNBIT_B32 $a, $a, (i32 8)))
	>;

	// Bitfield-mask patterns for value type `vt`.
	//   BFM - instruction selected for the mask expression
	//   MOV - move instruction used to materialize the constant 0 operand
	multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
	// ((1 << a) - 1) << b  ->  BFM a, b
	def : Pat <
	(vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
	(BFM $a, $b)
	>;

	// (1 << a) - 1  ->  BFM a, 0
	def : Pat <
	(vt (add (vt (shl 1, vt:$a)), -1)),
	(BFM $a, (MOV (i32 0)))
	>;
	}

	defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
	// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;

	def : BFEPattern <V_BFE_U32, S_MOV_B32>;

	def : Pat<
	(fcanonicalize f16:$src),
	(V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0)
	>;

	def : Pat<
	(fcanonicalize f32:$src),
	(V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
	>;

	def : Pat<
	(fcanonicalize f64:$src),
	(V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0)
	>;

	//===----------------------------------------------------------------------===//
	// Fract Patterns
	//===----------------------------------------------------------------------===//

	let Predicates = [isSI] in {

	// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
	// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
	// way to implement it is using V_FRACT_F64.
	// The workaround for the V_FRACT bug is:
	// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)

	// Convert floor(x) to (x - fract(x))
	// f64 floor(x) is computed as x + (-fract(x)), with fract(x) carrying the
	// V_FRACT workaround: the V_FRACT_F64 result is clamped by V_MIN_F64 against
	// 0x3fefffffffffffff, and V_CNDMASK_B64_PSEUDO picks between that clamped
	// value and $x itself using the V_CMP_CLASS_F64 NaN test (class mask 3).
	// The source modifiers matched by VOP3Mods are forwarded to both the outer
	// add and the inner V_FRACT_F64.
	def : Pat <
	  (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
	  (V_ADD_F64
	    $mods,
	    $x,
	    SRCMODS.NEG,
	    (V_CNDMASK_B64_PSEUDO
	      (V_MIN_F64
	        SRCMODS.NONE,
	        (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
	        SRCMODS.NONE,
	        (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
	        DSTCLAMP.NONE, DSTOMOD.NONE),
	      $x,
	      (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
	    DSTCLAMP.NONE, DSTOMOD.NONE)
	>;

	} // End Predicates = [isSI]

	//============================================================================//
	// Miscellaneous Optimization Patterns
	//============================================================================//

	def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;

	def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
	def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;

	//============================================================================//
	// Assembler aliases
	//============================================================================//

	def : MnemonicAlias<"v_add_u32", "v_add_i32">;
	def : MnemonicAlias<"v_sub_u32", "v_sub_i32">;
	def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">;

	} // End isGCN predicate
	Index: projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td (revision 314268)
	+++ projects/clang400-import/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td (revision 314269)
	@@ -1,621 +1,615 @@
	//===-- VOP1Instructions.td - Vector Instruction Definitions --------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// VOP1 Classes
	//===----------------------------------------------------------------------===//

	// 32-bit VOP1 encoding for opcode `op` and operand profile `P`.
	//   bits  8-0  : src0, or 0 when the profile has no src0
	//   bits 16-9  : opcode
	//   bits 24-17 : vdst, or 0 when the profile does not emit a destination
	//   bits 31-25 : fixed 0x3f
	class VOP1e <bits<8> op, VOPProfile P> : Enc32 {
	bits<8> vdst;
	bits<9> src0;

	let Inst{8-0} = !if(P.HasSrc0, src0{8-0}, 0);
	let Inst{16-9} = op;
	let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
	let Inst{31-25} = 0x3f; //encoding
	}

	// SDWA form of the VOP1 encoding, layered on VOP_SDWAe. Bits 8-0 hold the
	// fixed value 0xf9 instead of src0; opcode, vdst and the 0x3f marker occupy
	// bits 16-9, 24-17 and 31-25 respectively.
	class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
	bits<8> vdst;

	let Inst{8-0} = 0xf9; // sdwa
	let Inst{16-9} = op;
	let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
	let Inst{31-25} = 0x3f; // encoding
	}

	// Pseudo instruction for the 32-bit form of a VOP1 operation.
	//   opName  - assembly mnemonic; "<opName>_e32" is aliased to it
	//   P       - operand profile supplying operand lists and asm operand string
	//   pattern - optional selection pattern (empty by default)
	// The record has an empty asm string and is marked isPseudo/isCodeGenOnly;
	// Mnemonic, AsmOperands and Pfl are kept as fields so that classes taking
	// this pseudo as a parameter can read them.
	class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
	InstSI <P.Outs32, P.Ins32, "", pattern>,
	VOP <opName>,
	SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>,
	MnemonicAlias<opName#"_e32", opName> {

	let isPseudo = 1;
	let isCodeGenOnly = 1;
	let UseNamedOperandTable = 1;

	// Pieces of the assembly string, combined later by the real instruction.
	string Mnemonic = opName;
	string AsmOperands = P.Asm32;

	let Size = 4;
	let mayLoad = 0;
	let mayStore = 0;
	let hasSideEffects = 0;
	let SubtargetPredicate = isGCN;

	let VOP1 = 1;
	let VALU = 1;
	let Uses = [EXEC];

	let AsmVariantName = AMDGPUAsmVariants.Default;

	// Profile this pseudo was built from.
	VOPProfile Pfl = P;
	}

	// Encodable ("real") counterpart of the VOP1 pseudo `ps` for one encoding
	// family. Operand lists come from the pseudo, the asm string is the pseudo's
	// Mnemonic followed by its AsmOperands, and no selection pattern is attached.
	//   ps             - pseudo instruction this record is derived from
	//   EncodingFamily - encoding family value passed on to SIMCInstr
	class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
	  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
	  SIMCInstr <ps.PseudoInstr, EncodingFamily> {

	  let isPseudo = 0;
	  let isCodeGenOnly = 0;

	  // copy relevant pseudo op flags
	  let SubtargetPredicate = ps.SubtargetPredicate;
	  let AsmMatchConverter = ps.AsmMatchConverter;
	  let AsmVariantName = ps.AsmVariantName;
	  let Constraints = ps.Constraints;
	  let DisableEncoding = ps.DisableEncoding;
	  let TSFlags = ps.TSFlags;
	}

	class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
	VOP_SDWA_Pseudo <OpName, P, pattern> {
	let AsmMatchConverter = "cvtSdwaVOP1";
	}

	// Builds the selection pattern list (`ret`) for the 64-bit form of a VOP1
	// operation matching `node`. When the profile has modifiers, src0 is matched
	// through VOP3Mods0, which also binds $src0_modifiers, $clamp and $omod;
	// otherwise src0 is matched directly.
	class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
	list<dag> ret = !if(P.HasModifiers,
	[(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
	i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
	[(set P.DstVT:$vdst, (node P.Src0VT:$src0))]);
	}

	// Defines the pseudo instructions for one VOP1 operation:
	//   _e32  - 32-bit pseudo (no selection pattern)
	//   _e64  - VOP3 pseudo carrying the selection pattern for `node`
	//   _sdwa - SDWA pseudo
	// `node` defaults to null_frag.
	multiclass VOP1Inst <string opName, VOPProfile P,
	SDPatternOperator node = null_frag> {
	def _e32 : VOP1_Pseudo <opName, P>;
	def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
	def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
	}

	//===----------------------------------------------------------------------===//
	// VOP1 Instructions
	//===----------------------------------------------------------------------===//

	let VOPAsmPrefer32Bit = 1 in {
	defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>;
	}

	let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
	defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
	} // End isMoveImm = 1

	// FIXME: Specify SchedRW for READFIRSTLANE_B32
	// TODO: Make profile for this, there is VOP3 encoding also
	// Defined directly rather than through VOP1Inst: the destination is an
	// SReg_32 and the source a VGPR_32. Selected for int_amdgcn_readfirstlane.
	def V_READFIRSTLANE_B32 :
	InstSI <(outs SReg_32:$vdst),
	(ins VGPR_32:$src0),
	"v_readfirstlane_b32 $vdst, $src0",
	[(set i32:$vdst, (int_amdgcn_readfirstlane i32:$src0))]>,
	Enc32 {

	let isCodeGenOnly = 0;
	let UseNamedOperandTable = 1;

	let Size = 4;
	let mayLoad = 0;
	let mayStore = 0;
	let hasSideEffects = 0;
	let SubtargetPredicate = isGCN;

	let VOP1 = 1;
	let VALU = 1;
	let Uses = [EXEC];
	let isConvergent = 1;

	bits<8> vdst;
	bits<9> src0;

	// Same field layout as the generic 32-bit VOP1 encoding, with opcode 0x2.
	let Inst{8-0} = src0;
	let Inst{16-9} = 0x2;
	let Inst{24-17} = vdst;
	let Inst{31-25} = 0x3f; //encoding
	}

	let SchedRW = [WriteQuarterRate32] in {
	defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
	defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>;
	defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>;
	defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>;
	defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
	defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
	defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>;
	defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>;
	defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
	defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
	defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>;
	defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
	defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
	defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>;
	defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>;
	defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>;
	defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>;
	defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
	defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>;
	} // End SchedRW = [WriteQuarterRate32]

	defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
	defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
	defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
	defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
	defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
	defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;

	let SchedRW = [WriteQuarterRate32] in {
	defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
	defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
	defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>;
	defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
	} // End SchedRW = [WriteQuarterRate32]

	let SchedRW = [WriteDouble] in {
	defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
	defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
	} // End SchedRW = [WriteDouble];

	defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;

	let SchedRW = [WriteDouble] in {
	defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
	} // End SchedRW = [WriteDouble]

	let SchedRW = [WriteQuarterRate32] in {
	defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
	defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
	} // End SchedRW = [WriteQuarterRate32]

	defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
	defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
	defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
	defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
	defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
	defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;

	let SchedRW = [WriteDoubleAdd] in {
	defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
	defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
	} // End SchedRW = [WriteDoubleAdd]

	defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
	defm V_FREXP_MANT_F32 : VOP1Inst <"v_frexp_mant_f32", VOP_F32_F32, int_amdgcn_frexp_mant>;

	let VOPAsmPrefer32Bit = 1 in {
	defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT<VOP_NONE>>;
	}

	// Restrict src0 to be VGPR
	def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
	let Src0RC32 = VRegSrc_32;
	let Src0RC64 = VRegSrc_32;

	let HasExt = 0;
	}

	// Special case because there are no true output operands. Hack vdst
	// to be a src operand. The custom inserter must add a tied implicit
	// def and use of the super register since there seems to be no way to
	// add an implicit def of a virtual register in tablegen.
	def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
	let Src0RC32 = VOPDstOperand<VGPR_32>;
	let Src0RC64 = VOPDstOperand<VGPR_32>;

	let Outs = (outs);
	let Ins32 = (ins Src0RC32:$vdst, VSrc_b32:$src0);
	let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
	let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
	bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
	let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0,
	clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
	src0_sel:$src0_sel);

	let Asm32 = getAsm32<1, 1>.ret;
	let Asm64 = getAsm64<1, 1, 0>.ret;
	let AsmDPP = getAsmDPP<1, 1, 0>.ret;
	let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;

	let HasExt = 0;
	let HasDst = 0;
	let EmitDst = 1; // force vdst emission
	}

	let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in {
	// v_movreld_b32 is a special case because the destination output
	// register is really a source. It isn't actually read (but may be
	// written), and is only to provide the base register to start
	// indexing from. Tablegen seems to not let you define an implicit
	// virtual register output for the super register being written into,
	// so this must have an implicit def of the register added to it.
	defm V_MOVRELD_B32 : VOP1Inst <"v_movreld_b32", VOP_MOVRELD>;
	defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
	defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
	} // End Uses = [M0, EXEC]

	// These instructions only exist on SI and CI
	let SubtargetPredicate = isSICI in {

	let SchedRW = [WriteQuarterRate32] in {
	defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
	defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
	defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>;
	defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
	defm V_RSQ_CLAMP_F32 : VOP1Inst <"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
	defm V_RSQ_LEGACY_F32 : VOP1Inst <"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
	} // End SchedRW = [WriteQuarterRate32]

	let SchedRW = [WriteDouble] in {
	defm V_RCP_CLAMP_F64 : VOP1Inst <"v_rcp_clamp_f64", VOP_F64_F64>;
	defm V_RSQ_CLAMP_F64 : VOP1Inst <"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>;
	} // End SchedRW = [WriteDouble]

	} // End SubtargetPredicate = isSICI


	let SubtargetPredicate = isCIVI in {

	let SchedRW = [WriteDoubleAdd] in {
	defm V_TRUNC_F64 : VOP1Inst <"v_trunc_f64", VOP_F64_F64, ftrunc>;
	defm V_CEIL_F64 : VOP1Inst <"v_ceil_f64", VOP_F64_F64, fceil>;
	defm V_FLOOR_F64 : VOP1Inst <"v_floor_f64", VOP_F64_F64, ffloor>;
	defm V_RNDNE_F64 : VOP1Inst <"v_rndne_f64", VOP_F64_F64, frint>;
	} // End SchedRW = [WriteDoubleAdd]

	let SchedRW = [WriteQuarterRate32] in {
	defm V_LOG_LEGACY_F32 : VOP1Inst <"v_log_legacy_f32", VOP_F32_F32>;
	defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
	} // End SchedRW = [WriteQuarterRate32]

	} // End SubtargetPredicate = isCIVI


	let SubtargetPredicate = isVI in {

	defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>;
	defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>;
	defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
	defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
	defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
	defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
	defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
	defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
	defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
	defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
	defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
	defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
	defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
	defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
	defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
	defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
	defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
	defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;

	}

	let Predicates = [isVI] in {

	def : Pat<
	(f32 (f16_to_fp i16:$src)),
	(V_CVT_F32_F16_e32 $src)
	>;

	def : Pat<
	(i16 (fp_to_f16 f32:$src)),
	(V_CVT_F16_F32_e32 $src)
	>;

	}

	//===----------------------------------------------------------------------===//
	// Target
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// SI
	//===----------------------------------------------------------------------===//

	// Defines the SI-encoded real instructions for the pseudos NAME_e32 and
	// NAME_e64 (assembler predicate isSICI, decoder namespace "SICI").
	//   op - opcode; the low 8 bits are used for the 32-bit encoding, and the
	//        VOP3 opcode is formed as {1, 1, op{6-0}}
	multiclass VOP1_Real_si <bits<9> op> {
	let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {
	def _e32_si :
	VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
	VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
	def _e64_si :
	VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
	VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
	}
	}

	defm V_NOP : VOP1_Real_si <0x0>;
	defm V_MOV_B32 : VOP1_Real_si <0x1>;
	defm V_CVT_I32_F64 : VOP1_Real_si <0x3>;
	defm V_CVT_F64_I32 : VOP1_Real_si <0x4>;
	defm V_CVT_F32_I32 : VOP1_Real_si <0x5>;
	defm V_CVT_F32_U32 : VOP1_Real_si <0x6>;
	defm V_CVT_U32_F32 : VOP1_Real_si <0x7>;
	defm V_CVT_I32_F32 : VOP1_Real_si <0x8>;
	defm V_MOV_FED_B32 : VOP1_Real_si <0x9>;
	defm V_CVT_F16_F32 : VOP1_Real_si <0xa>;
	defm V_CVT_F32_F16 : VOP1_Real_si <0xb>;
	defm V_CVT_RPI_I32_F32 : VOP1_Real_si <0xc>;
	defm V_CVT_FLR_I32_F32 : VOP1_Real_si <0xd>;
	defm V_CVT_OFF_F32_I4 : VOP1_Real_si <0xe>;
	defm V_CVT_F32_F64 : VOP1_Real_si <0xf>;
	defm V_CVT_F64_F32 : VOP1_Real_si <0x10>;
	defm V_CVT_F32_UBYTE0 : VOP1_Real_si <0x11>;
	defm V_CVT_F32_UBYTE1 : VOP1_Real_si <0x12>;
	defm V_CVT_F32_UBYTE2 : VOP1_Real_si <0x13>;
	defm V_CVT_F32_UBYTE3 : VOP1_Real_si <0x14>;
	defm V_CVT_U32_F64 : VOP1_Real_si <0x15>;
	defm V_CVT_F64_U32 : VOP1_Real_si <0x16>;
	defm V_FRACT_F32 : VOP1_Real_si <0x20>;
	defm V_TRUNC_F32 : VOP1_Real_si <0x21>;
	defm V_CEIL_F32 : VOP1_Real_si <0x22>;
	defm V_RNDNE_F32 : VOP1_Real_si <0x23>;
	defm V_FLOOR_F32 : VOP1_Real_si <0x24>;
	defm V_EXP_F32 : VOP1_Real_si <0x25>;
	defm V_LOG_CLAMP_F32 : VOP1_Real_si <0x26>;
	defm V_LOG_F32 : VOP1_Real_si <0x27>;
	defm V_RCP_CLAMP_F32 : VOP1_Real_si <0x28>;
	defm V_RCP_LEGACY_F32 : VOP1_Real_si <0x29>;
	defm V_RCP_F32 : VOP1_Real_si <0x2a>;
	defm V_RCP_IFLAG_F32 : VOP1_Real_si <0x2b>;
	defm V_RSQ_CLAMP_F32 : VOP1_Real_si <0x2c>;
	defm V_RSQ_LEGACY_F32 : VOP1_Real_si <0x2d>;
	defm V_RSQ_F32 : VOP1_Real_si <0x2e>;
	defm V_RCP_F64 : VOP1_Real_si <0x2f>;
	defm V_RCP_CLAMP_F64 : VOP1_Real_si <0x30>;
	defm V_RSQ_F64 : VOP1_Real_si <0x31>;
	defm V_RSQ_CLAMP_F64 : VOP1_Real_si <0x32>;
	defm V_SQRT_F32 : VOP1_Real_si <0x33>;
	defm V_SQRT_F64 : VOP1_Real_si <0x34>;
	defm V_SIN_F32 : VOP1_Real_si <0x35>;
	defm V_COS_F32 : VOP1_Real_si <0x36>;
	defm V_NOT_B32 : VOP1_Real_si <0x37>;
	defm V_BFREV_B32 : VOP1_Real_si <0x38>;
	defm V_FFBH_U32 : VOP1_Real_si <0x39>;
	defm V_FFBL_B32 : VOP1_Real_si <0x3a>;
	defm V_FFBH_I32 : VOP1_Real_si <0x3b>;
	defm V_FREXP_EXP_I32_F64 : VOP1_Real_si <0x3c>;
	defm V_FREXP_MANT_F64 : VOP1_Real_si <0x3d>;
	defm V_FRACT_F64 : VOP1_Real_si <0x3e>;
	defm V_FREXP_EXP_I32_F32 : VOP1_Real_si <0x3f>;
	defm V_FREXP_MANT_F32 : VOP1_Real_si <0x40>;
	defm V_CLREXCP : VOP1_Real_si <0x41>;
	defm V_MOVRELD_B32 : VOP1_Real_si <0x42>;
	defm V_MOVRELS_B32 : VOP1_Real_si <0x43>;
	defm V_MOVRELSD_B32 : VOP1_Real_si <0x44>;

	//===----------------------------------------------------------------------===//
	// CI
	//===----------------------------------------------------------------------===//

	// Defines the real instructions NAME_e32_ci and NAME_e64_ci for operations
	// gated on isCIOnly (decoder namespace "CI"). Note that the encoding family
	// passed to the real classes is still SIEncodingFamily.SI and the VOP3 side
	// uses VOP3e_si.
	//   op - opcode; the low 8 bits are used for the 32-bit encoding, and the
	//        VOP3 opcode is formed as {1, 1, op{6-0}}
	multiclass VOP1_Real_ci <bits<9> op> {
	let AssemblerPredicates = [isCIOnly], DecoderNamespace = "CI" in {
	def _e32_ci :
	VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
	VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
	def _e64_ci :
	VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
	VOP3e_si <{1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
	}
	}

	defm V_TRUNC_F64 : VOP1_Real_ci <0x17>;
	defm V_CEIL_F64 : VOP1_Real_ci <0x18>;
	defm V_FLOOR_F64 : VOP1_Real_ci <0x1A>;
	defm V_RNDNE_F64 : VOP1_Real_ci <0x19>;
	defm V_LOG_LEGACY_F32 : VOP1_Real_ci <0x45>;
	defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>;

	//===----------------------------------------------------------------------===//
	// VI
	//===----------------------------------------------------------------------===//

	// DPP form of a VOP1 instruction built from the 32-bit pseudo `ps`.
	//   op - opcode placed in bits 16-9
	//   ps - pseudo whose Defs, Uses, SchedRW, hasSideEffects, Constraints and
	//        DisableEncoding are copied
	//   P  - operand profile; defaults to the profile stored on the pseudo
	// Bits 8-0 hold the fixed value 0xfa instead of src0.
	class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
	VOP_DPP <ps.OpName, P> {
	let Defs = ps.Defs;
	let Uses = ps.Uses;
	let SchedRW = ps.SchedRW;
	let hasSideEffects = ps.hasSideEffects;
	let Constraints = ps.Constraints;
	let DisableEncoding = ps.DisableEncoding;

	bits<8> vdst;
	let Inst{8-0} = 0xfa; // dpp
	let Inst{16-9} = op;
	let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
	let Inst{31-25} = 0x3f; //encoding
	}

	// Defines the VI real instructions for the pseudos of NAME:
	//   _e32_vi  - 32-bit encoding, opcode op{7-0}
	//   _e64_vi  - VOP3 encoding, opcode 0x140 + op
	//   _sdwa_vi - SDWA encoding, opcode op{7-0}
	//   _dpp     - DPP encoding, opcode op{7-0}
	// Only the _e32_vi and _e64_vi definitions sit inside the
	// AssemblerPredicates / DecoderNamespace `let` block.
	multiclass VOP1_Real_vi <bits<10> op> {
	let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
	def _e32_vi :
	VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
	VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32").Pfl>;
	def _e64_vi :
	VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
	VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
	}

	def _sdwa_vi :
	VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
	VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;

	// For now the dpp form exists only for the assembler/disassembler; it is
	// built from the _e32 pseudo.
	// TODO: add corresponding pseudo
	def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
	}

	defm V_NOP : VOP1_Real_vi <0x0>;
	defm V_MOV_B32 : VOP1_Real_vi <0x1>;
	defm V_CVT_I32_F64 : VOP1_Real_vi <0x3>;
	defm V_CVT_F64_I32 : VOP1_Real_vi <0x4>;
	defm V_CVT_F32_I32 : VOP1_Real_vi <0x5>;
	defm V_CVT_F32_U32 : VOP1_Real_vi <0x6>;
	defm V_CVT_U32_F32 : VOP1_Real_vi <0x7>;
	defm V_CVT_I32_F32 : VOP1_Real_vi <0x8>;
	defm V_CVT_F16_F32 : VOP1_Real_vi <0xa>;
	defm V_CVT_F32_F16 : VOP1_Real_vi <0xb>;
	defm V_CVT_RPI_I32_F32 : VOP1_Real_vi <0xc>;
	defm V_CVT_FLR_I32_F32 : VOP1_Real_vi <0xd>;
	defm V_CVT_OFF_F32_I4 : VOP1_Real_vi <0xe>;
	defm V_CVT_F32_F64 : VOP1_Real_vi <0xf>;
	defm V_CVT_F64_F32 : VOP1_Real_vi <0x10>;
	defm V_CVT_F32_UBYTE0 : VOP1_Real_vi <0x11>;
	defm V_CVT_F32_UBYTE1 : VOP1_Real_vi <0x12>;
	defm V_CVT_F32_UBYTE2 : VOP1_Real_vi <0x13>;
	defm V_CVT_F32_UBYTE3 : VOP1_Real_vi <0x14>;
	defm V_CVT_U32_F64 : VOP1_Real_vi <0x15>;
	defm V_CVT_F64_U32 : VOP1_Real_vi <0x16>;
	defm V_FRACT_F32 : VOP1_Real_vi <0x1b>;
	defm V_TRUNC_F32 : VOP1_Real_vi <0x1c>;
	defm V_CEIL_F32 : VOP1_Real_vi <0x1d>;
	defm V_RNDNE_F32 : VOP1_Real_vi <0x1e>;
	defm V_FLOOR_F32 : VOP1_Real_vi <0x1f>;
	defm V_EXP_F32 : VOP1_Real_vi <0x20>;
	defm V_LOG_F32 : VOP1_Real_vi <0x21>;
	defm V_RCP_F32 : VOP1_Real_vi <0x22>;
	defm V_RCP_IFLAG_F32 : VOP1_Real_vi <0x23>;
	defm V_RSQ_F32 : VOP1_Real_vi <0x24>;
	defm V_RCP_F64 : VOP1_Real_vi <0x25>;
	defm V_RSQ_F64 : VOP1_Real_vi <0x26>;
	defm V_SQRT_F32 : VOP1_Real_vi <0x27>;
	defm V_SQRT_F64 : VOP1_Real_vi <0x28>;
	defm V_SIN_F32 : VOP1_Real_vi <0x29>;
	defm V_COS_F32 : VOP1_Real_vi <0x2a>;
	defm V_NOT_B32 : VOP1_Real_vi <0x2b>;
	defm V_BFREV_B32 : VOP1_Real_vi <0x2c>;
	defm V_FFBH_U32 : VOP1_Real_vi <0x2d>;
	defm V_FFBL_B32 : VOP1_Real_vi <0x2e>;
	defm V_FFBH_I32 : VOP1_Real_vi <0x2f>;
	defm V_FREXP_EXP_I32_F64 : VOP1_Real_vi <0x30>;
	defm V_FREXP_MANT_F64 : VOP1_Real_vi <0x31>;
	defm V_FRACT_F64 : VOP1_Real_vi <0x32>;
	defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>;
	defm V_FREXP_MANT_F32 : VOP1_Real_vi <0x34>;
	defm V_CLREXCP : VOP1_Real_vi <0x35>;
	defm V_MOVRELD_B32 : VOP1_Real_vi <0x36>;
	defm V_MOVRELS_B32 : VOP1_Real_vi <0x37>;
	defm V_MOVRELSD_B32 : VOP1_Real_vi <0x38>;
	defm V_TRUNC_F64 : VOP1_Real_vi <0x17>;
	defm V_CEIL_F64 : VOP1_Real_vi <0x18>;
	defm V_FLOOR_F64 : VOP1_Real_vi <0x1A>;
	defm V_RNDNE_F64 : VOP1_Real_vi <0x19>;
	defm V_LOG_LEGACY_F32 : VOP1_Real_vi <0x4c>;
	defm V_EXP_LEGACY_F32 : VOP1_Real_vi <0x4b>;
	defm V_CVT_F16_U16 : VOP1_Real_vi <0x39>;
	defm V_CVT_F16_I16 : VOP1_Real_vi <0x3a>;
	defm V_CVT_U16_F16 : VOP1_Real_vi <0x3b>;
	defm V_CVT_I16_F16 : VOP1_Real_vi <0x3c>;
	defm V_RCP_F16 : VOP1_Real_vi <0x3d>;
	defm V_SQRT_F16 : VOP1_Real_vi <0x3e>;
	defm V_RSQ_F16 : VOP1_Real_vi <0x3f>;
	defm V_LOG_F16 : VOP1_Real_vi <0x40>;
	defm V_EXP_F16 : VOP1_Real_vi <0x41>;
	defm V_FREXP_MANT_F16 : VOP1_Real_vi <0x42>;
	defm V_FREXP_EXP_I16_F16 : VOP1_Real_vi <0x43>;
	defm V_FLOOR_F16 : VOP1_Real_vi <0x44>;
	defm V_CEIL_F16 : VOP1_Real_vi <0x45>;
	defm V_TRUNC_F16 : VOP1_Real_vi <0x46>;
	defm V_RNDNE_F16 : VOP1_Real_vi <0x47>;
	defm V_FRACT_F16 : VOP1_Real_vi <0x48>;
	defm V_SIN_F16 : VOP1_Real_vi <0x49>;
	defm V_COS_F16 : VOP1_Real_vi <0x4a>;


	// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
	// indexing mode. vdst can't be treated as a def for codegen purposes,
	// and an implicit use and def of the super register should be added.
	def V_MOV_B32_indirect : VPseudoInstSI<(outs),
	(ins getVALUDstForVT<i32>.ret:$vdst, getVOPSrc0ForVT<i32>.ret:$src0)>,
	PseudoInstExpansion<(V_MOV_B32_e32_vi getVALUDstForVT<i32>.ret:$vdst,
	getVOPSrc0ForVT<i32>.ret:$src0)> {
	let VOP1 = 1;
	let SubtargetPredicate = isVI;
	}

	// This is a pseudo variant of the v_movreld_b32 instruction in which the
	// vector operand appears only twice, once as def and once as use. Using this
	// pseudo avoids problems with the Two Address instructions pass.
	class V_MOVRELD_B32_pseudo<RegisterClass rc> : VPseudoInstSI <
	(outs rc:$vdst),
	(ins rc:$vsrc, VSrc_b32:$val, i32imm:$offset)> {
	let VOP1 = 1;

	let Constraints = "$vsrc = $vdst";
	let Uses = [M0, EXEC];

	let SubtargetPredicate = HasMovrel;
	}

	def V_MOVRELD_B32_V1 : V_MOVRELD_B32_pseudo<VGPR_32>;
	def V_MOVRELD_B32_V2 : V_MOVRELD_B32_pseudo<VReg_64>;
	def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
	def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
	def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;

	let Predicates = [isVI] in {

	def : Pat <
	(i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
	imm:$bound_ctrl)),
	(V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
	(as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
	>;


	def : Pat<
	(i32 (anyext i16:$src)),
	(COPY $src)
	>;

	def : Pat<
	(i64 (anyext i16:$src)),
	(REG_SEQUENCE VReg_64,
	(i32 (COPY $src)), sub0,
	(V_MOV_B32_e32 (i32 0)), sub1)
	>;

	def : Pat<
	(i16 (trunc i32:$src)),
	(COPY $src)
	>;

	-def : Pat<
	- (i1 (trunc i16:$src)),
	- (COPY $src)
	->;
	-
	-
	def : Pat <
	(i16 (trunc i64:$src)),
	(EXTRACT_SUBREG $src, sub0)
	>;

	} // End Predicates = [isVI]
	Index: projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp (revision 314268)
	+++ projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp (revision 314269)
	@@ -1,577 +1,582 @@
	//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the Correlated Value Propagation pass.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
	#include "llvm/Transforms/Scalar.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/GlobalsModRef.h"
	#include "llvm/Analysis/InstructionSimplify.h"
	#include "llvm/Analysis/LazyValueInfo.h"
	#include "llvm/IR/CFG.h"
	#include "llvm/IR/ConstantRange.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Module.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/Utils/Local.h"
	using namespace llvm;

	#define DEBUG_TYPE "correlated-value-propagation"

	STATISTIC(NumPhis, "Number of phis propagated");
	STATISTIC(NumSelects, "Number of selects propagated");
	STATISTIC(NumMemAccess, "Number of memory access targets propagated");
	STATISTIC(NumCmps, "Number of comparisons propagated");
	STATISTIC(NumReturns, "Number of return values propagated");
	STATISTIC(NumDeadCases, "Number of switch cases removed");
	STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
	STATISTIC(NumAShrs, "Number of ashr converted to lshr");
	STATISTIC(NumSRems, "Number of srem converted to urem");

	+static cl::opt<bool> DontProcessAdds("cvp-dont-process-adds", cl::init(true));
	+
	namespace {
	// FunctionPass wrapper for the Correlated Value Propagation pass; the
	// transformation itself is driven from runOnFunction.
	class CorrelatedValuePropagation : public FunctionPass {
	public:
	// Pass identifier handed to the FunctionPass base class.
	static char ID;
	// Initializes the pass against the global PassRegistry on construction.
	CorrelatedValuePropagation(): FunctionPass(ID) {
	initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
	}

	bool runOnFunction(Function &F) override;

	// Requires LazyValueInfo and declares GlobalsAA as preserved.
	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.addRequired<LazyValueInfoWrapperPass>();
	AU.addPreserved<GlobalsAAWrapperPass>();
	}
	};
	}

	// Storage for the pass ID declared in the class above.
	char CorrelatedValuePropagation::ID = 0;
	// Pass registration under the name "correlated-propagation", with
	// LazyValueInfoWrapperPass declared as a dependency.
	INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
	"Value Propagation", false, false)
	INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
	INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
	"Value Propagation", false, false)

	// Public interface to the Value Propagation pass
	// Returns a freshly heap-allocated pass instance.
	Pass *llvm::createCorrelatedValuePropagationPass() {
	return new CorrelatedValuePropagation();
	}

	static bool processSelect(SelectInst S, LazyValueInfo LVI) {
	if (S->getType()->isVectorTy()) return false;
	if (isa<Constant>(S->getOperand(0))) return false;

	Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
	if (!C) return false;

	ConstantInt *CI = dyn_cast<ConstantInt>(C);
	if (!CI) return false;

	Value *ReplaceWith = S->getOperand(1);
	Value *Other = S->getOperand(2);
	if (!CI->isOne()) std::swap(ReplaceWith, Other);
	if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());

	S->replaceAllUsesWith(ReplaceWith);
	S->eraseFromParent();

	++NumSelects;

	return true;
	}

	static bool processPHI(PHINode P, LazyValueInfo LVI) {
	bool Changed = false;

	BasicBlock *BB = P->getParent();
	for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
	Value *Incoming = P->getIncomingValue(i);
	if (isa<Constant>(Incoming)) continue;

	Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);

	// Look if the incoming value is a select with a scalar condition for which
	// LVI can tells us the value. In that case replace the incoming value with
	// the appropriate value of the select. This often allows us to remove the
	// select later.
	if (!V) {
	SelectInst *SI = dyn_cast<SelectInst>(Incoming);
	if (!SI) continue;

	Value *Condition = SI->getCondition();
	if (!Condition->getType()->isVectorTy()) {
	if (Constant *C = LVI->getConstantOnEdge(
	Condition, P->getIncomingBlock(i), BB, P)) {
	if (C->isOneValue()) {
	V = SI->getTrueValue();
	} else if (C->isZeroValue()) {
	V = SI->getFalseValue();
	}
	// Once LVI learns to handle vector types, we could also add support
	// for vector type constants that are not all zeroes or all ones.
	}
	}

	// Look if the select has a constant but LVI tells us that the incoming
	// value can never be that constant. In that case replace the incoming
	// value with the other value of the select. This often allows us to
	// remove the select later.
	if (!V) {
	Constant *C = dyn_cast<Constant>(SI->getFalseValue());
	if (!C) continue;

	if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
	P->getIncomingBlock(i), BB, P) !=
	LazyValueInfo::False)
	continue;
	V = SI->getTrueValue();
	}

	DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
	}

	P->setIncomingValue(i, V);
	Changed = true;
	}

	// FIXME: Provide TLI, DT, AT to SimplifyInstruction.
	const DataLayout &DL = BB->getModule()->getDataLayout();
	if (Value *V = SimplifyInstruction(P, DL)) {
	P->replaceAllUsesWith(V);
	P->eraseFromParent();
	Changed = true;
	}

	if (Changed)
	++NumPhis;

	return Changed;
	}

	static bool processMemAccess(Instruction I, LazyValueInfo LVI) {
	Value *Pointer = nullptr;
	if (LoadInst *L = dyn_cast<LoadInst>(I))
	Pointer = L->getPointerOperand();
	else
	Pointer = cast<StoreInst>(I)->getPointerOperand();

	if (isa<Constant>(Pointer)) return false;

	Constant *C = LVI->getConstant(Pointer, I->getParent(), I);
	if (!C) return false;

	++NumMemAccess;
	I->replaceUsesOfWith(Pointer, C);
	return true;
	}

	/// See if LazyValueInfo's ability to exploit edge conditions or range
	/// information is sufficient to prove this comparison. Even for local
	/// conditions, this can sometimes prove conditions instcombine can't by
	/// exploiting range information.
	static bool processCmp(CmpInst C, LazyValueInfo LVI) {
	Value *Op0 = C->getOperand(0);
	Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
	if (!Op1) return false;

	// As a policy choice, we choose not to waste compile time on anything where
	// the comparison is testing local values. While LVI can sometimes reason
	// about such cases, it's not its primary purpose. We do make sure to do
	// the block local query for uses from terminator instructions, but that's
	// handled in the code for each terminator.
	auto *I = dyn_cast<Instruction>(Op0);
	if (I && I->getParent() == C->getParent())
	return false;

	LazyValueInfo::Tristate Result =
	LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C);
	if (Result == LazyValueInfo::Unknown) return false;

	++NumCmps;
	if (Result == LazyValueInfo::True)
	C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext()));
	else
	C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext()));
	C->eraseFromParent();

	return true;
	}

	/// Simplify a switch instruction by removing cases which can never fire. If the
	/// uselessness of a case could be determined locally then constant propagation
	/// would already have figured it out. Instead, walk the predecessors and
	/// statically evaluate cases based on information available on that edge. Cases
	/// that cannot fire no matter what the incoming edge can safely be removed. If
	/// a case fires on every incoming edge then the entire switch can be removed
	/// and replaced with a branch to the case destination.
	static bool processSwitch(SwitchInst SI, LazyValueInfo LVI) {
	Value *Cond = SI->getCondition();
	BasicBlock *BB = SI->getParent();

	// If the condition was defined in same block as the switch then LazyValueInfo
	// currently won't say anything useful about it, though in theory it could.
	if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB)
	return false;

	// If the switch is unreachable then trying to improve it is a waste of time.
	pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
	if (PB == PE) return false;

	// Analyse each switch case in turn. This is done in reverse order so that
	// removing a case doesn't cause trouble for the iteration.
	bool Changed = false;
	for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE;
	) {
	ConstantInt *Case = CI.getCaseValue();

	// Check to see if the switch condition is equal to/not equal to the case
	// value on every incoming edge, equal/not equal being the same each time.
	LazyValueInfo::Tristate State = LazyValueInfo::Unknown;
	for (pred_iterator PI = PB; PI != PE; ++PI) {
	// Is the switch condition equal to the case value?
	LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ,
	Cond, Case, *PI,
	BB, SI);
	// Give up on this case if nothing is known.
	if (Value == LazyValueInfo::Unknown) {
	State = LazyValueInfo::Unknown;
	break;
	}

	// If this was the first edge to be visited, record that all other edges
	// need to give the same result.
	if (PI == PB) {
	State = Value;
	continue;
	}

	// If this case is known to fire for some edges and known not to fire for
	// others then there is nothing we can do - give up.
	if (Value != State) {
	State = LazyValueInfo::Unknown;
	break;
	}
	}

	if (State == LazyValueInfo::False) {
	// This case never fires - remove it.
	CI.getCaseSuccessor()->removePredecessor(BB);
	SI->removeCase(CI); // Does not invalidate the iterator.

	// The condition can be modified by removePredecessor's PHI simplification
	// logic.
	Cond = SI->getCondition();

	++NumDeadCases;
	Changed = true;
	} else if (State == LazyValueInfo::True) {
	// This case always fires. Arrange for the switch to be turned into an
	// unconditional branch by replacing the switch condition with the case
	// value.
	SI->setCondition(Case);
	NumDeadCases += SI->getNumCases();
	Changed = true;
	break;
	}
	}

	if (Changed)
	// If the switch has been simplified to the point where it can be replaced
	// by a branch then do so now.
	ConstantFoldTerminator(BB);

	return Changed;
	}

	/// Add the nonnull attribute to every pointer argument of a call site that
	/// LazyValueInfo can prove is never null at the call.
	///
	/// \returns true if at least one attribute was added.
	static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
	  SmallVector<unsigned, 4> Indices;
	  unsigned ArgNo = 0;

	  for (Value *Arg : CS.args()) {
	    // The attribute index used below is the argument position plus one.
	    const unsigned AttrIdx = ++ArgNo;

	    // Only pointer arguments that are not already marked are interesting, and
	    // constants are skipped because they are obviously null or non-null
	    // without the relatively expensive analysis.
	    PointerType *PtrTy = dyn_cast<PointerType>(Arg->getType());
	    if (!PtrTy || CS.paramHasAttr(AttrIdx, Attribute::NonNull) ||
	        isa<Constant>(Arg))
	      continue;

	    // "Arg == null" being provably false means the argument is non-null.
	    if (LVI->getPredicateAt(ICmpInst::ICMP_EQ, Arg,
	                            ConstantPointerNull::get(PtrTy),
	                            CS.getInstruction()) == LazyValueInfo::False)
	      Indices.push_back(AttrIdx);
	  }

	  assert(ArgNo == CS.arg_size() && "sanity check");

	  if (Indices.empty())
	    return false;

	  // Rebuild the attribute set with nonnull on all collected indices.
	  LLVMContext &Ctx = CS.getInstruction()->getContext();
	  AttributeSet AS = CS.getAttributes();
	  AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull));
	  CS.setAttributes(AS);

	  return true;
	}

	// Helper used by the srem/sdiv/ashr/add rewrites below. Returns true when any
	// operand of SDI is an instruction defined in SDI's own basic block. Such
	// cases are skipped as a policy choice to save compile time: LVI can
	// sometimes reason about local defs, but that is not its primary purpose.
	static bool hasLocalDefs(BinaryOperator *SDI) {
	  const BasicBlock *Home = SDI->getParent();
	  for (Value *Operand : SDI->operands())
	    if (auto *Def = dyn_cast<Instruction>(Operand))
	      if (Def->getParent() == Home)
	        return true;
	  return false;
	}

	static bool hasPositiveOperands(BinaryOperator SDI, LazyValueInfo LVI) {
	Constant *Zero = ConstantInt::get(SDI->getType(), 0);
	for (Value *O : SDI->operands()) {
	auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SGE, O, Zero, SDI);
	if (Result != LazyValueInfo::True)
	return false;
	}
	return true;
	}

	static bool processSRem(BinaryOperator SDI, LazyValueInfo LVI) {
	if (SDI->getType()->isVectorTy() \|\| hasLocalDefs(SDI) \|\|
	!hasPositiveOperands(SDI, LVI))
	return false;

	++NumSRems;
	auto *BO = BinaryOperator::CreateURem(SDI->getOperand(0), SDI->getOperand(1),
	SDI->getName(), SDI);
	SDI->replaceAllUsesWith(BO);
	SDI->eraseFromParent();
	return true;
	}

	/// See if LazyValueInfo's ability to exploit edge conditions or range
	/// information is sufficient to prove the both operands of this SDiv are
	/// positive. If this is the case, replace the SDiv with a UDiv. Even for local
	/// conditions, this can sometimes prove conditions instcombine can't by
	/// exploiting range information.
	static bool processSDiv(BinaryOperator SDI, LazyValueInfo LVI) {
	if (SDI->getType()->isVectorTy() \|\| hasLocalDefs(SDI) \|\|
	!hasPositiveOperands(SDI, LVI))
	return false;

	++NumSDivs;
	auto *BO = BinaryOperator::CreateUDiv(SDI->getOperand(0), SDI->getOperand(1),
	SDI->getName(), SDI);
	BO->setIsExact(SDI->isExact());
	SDI->replaceAllUsesWith(BO);
	SDI->eraseFromParent();

	return true;
	}

	static bool processAShr(BinaryOperator SDI, LazyValueInfo LVI) {
	if (SDI->getType()->isVectorTy() \|\| hasLocalDefs(SDI))
	return false;

	Constant *Zero = ConstantInt::get(SDI->getType(), 0);
	if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, SDI->getOperand(0), Zero, SDI) !=
	LazyValueInfo::True)
	return false;

	++NumAShrs;
	auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
	SDI->getName(), SDI);
	BO->setIsExact(SDI->isExact());
	SDI->replaceAllUsesWith(BO);
	SDI->eraseFromParent();

	return true;
	}

	static bool processAdd(BinaryOperator AddOp, LazyValueInfo LVI) {
	typedef OverflowingBinaryOperator OBO;
	+
	+ if (DontProcessAdds)
	+ return false;

	if (AddOp->getType()->isVectorTy() \|\| hasLocalDefs(AddOp))
	return false;

	bool NSW = AddOp->hasNoSignedWrap();
	bool NUW = AddOp->hasNoUnsignedWrap();
	if (NSW && NUW)
	return false;

	BasicBlock *BB = AddOp->getParent();

	Value *LHS = AddOp->getOperand(0);
	Value *RHS = AddOp->getOperand(1);

	ConstantRange LRange = LVI->getConstantRange(LHS, BB, AddOp);

	// Initialize RRange only if we need it. If we know that guaranteed no wrap
	// range for the given LHS range is empty don't spend time calculating the
	// range for the RHS.
	Optional<ConstantRange> RRange;
	auto LazyRRange = [&] () {
	if (!RRange)
	RRange = LVI->getConstantRange(RHS, BB, AddOp);
	return RRange.getValue();
	};

	bool Changed = false;
	if (!NUW) {
	ConstantRange NUWRange =
	LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
	OBO::NoUnsignedWrap);
	if (!NUWRange.isEmptySet()) {
	bool NewNUW = NUWRange.contains(LazyRRange());
	AddOp->setHasNoUnsignedWrap(NewNUW);
	Changed \|= NewNUW;
	}
	}
	if (!NSW) {
	ConstantRange NSWRange =
	LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
	OBO::NoSignedWrap);
	if (!NSWRange.isEmptySet()) {
	bool NewNSW = NSWRange.contains(LazyRRange());
	AddOp->setHasNoSignedWrap(NewNSW);
	Changed \|= NewNSW;
	}
	}

	return Changed;
	}

	/// Ask LazyValueInfo for a constant value of \p V at instruction \p At. If
	/// that fails and \p V is a comparison against a constant, try to resolve the
	/// comparison itself to true or false.
	///
	/// \returns the constant, or nullptr if nothing could be proven.
	static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
	  if (Constant *C = LVI->getConstant(V, At->getParent(), At))
	    return C;

	  // TODO: The following really should be sunk inside LVI's core algorithm, or
	  // at least the outer shims around such.
	  auto *C = dyn_cast<CmpInst>(V);
	  if (!C) return nullptr;

	  Value *Op0 = C->getOperand(0);
	  Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
	  if (!Op1) return nullptr;

	  LazyValueInfo::Tristate Result =
	      LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At);
	  if (Result == LazyValueInfo::Unknown)
	    return nullptr;

	  return (Result == LazyValueInfo::True) ?
	      ConstantInt::getTrue(C->getContext()) :
	      ConstantInt::getFalse(C->getContext());
	}

	/// Shared driver for both pass wrappers: visit every block reachable from the
	/// entry block and dispatch each instruction (and then the terminator) to
	/// the matching process* helper.
	///
	/// \returns true if anything in \p F was changed.
	static bool runImpl(Function &F, LazyValueInfo *LVI) {
	  bool FnChanged = false;

	  // Visiting in a pre-order depth-first traversal causes us to simplify early
	  // blocks before querying later blocks (which require us to analyze early
	  // blocks). Eagerly simplifying shallow blocks means there is strictly less
	  // work to do for deep blocks. This also means we don't visit unreachable
	  // blocks.
	  for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
	    bool BBChanged = false;
	    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
	      // Advance the iterator before processing: the helpers may erase II.
	      Instruction *II = &*BI++;
	      switch (II->getOpcode()) {
	      case Instruction::Select:
	        BBChanged |= processSelect(cast<SelectInst>(II), LVI);
	        break;
	      case Instruction::PHI:
	        BBChanged |= processPHI(cast<PHINode>(II), LVI);
	        break;
	      case Instruction::ICmp:
	      case Instruction::FCmp:
	        BBChanged |= processCmp(cast<CmpInst>(II), LVI);
	        break;
	      case Instruction::Load:
	      case Instruction::Store:
	        BBChanged |= processMemAccess(II, LVI);
	        break;
	      case Instruction::Call:
	      case Instruction::Invoke:
	        BBChanged |= processCallSite(CallSite(II), LVI);
	        break;
	      case Instruction::SRem:
	        BBChanged |= processSRem(cast<BinaryOperator>(II), LVI);
	        break;
	      case Instruction::SDiv:
	        BBChanged |= processSDiv(cast<BinaryOperator>(II), LVI);
	        break;
	      case Instruction::AShr:
	        BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
	        break;
	      case Instruction::Add:
	        BBChanged |= processAdd(cast<BinaryOperator>(II), LVI);
	        break;
	      }
	    }

	    Instruction *Term = BB->getTerminator();
	    switch (Term->getOpcode()) {
	    case Instruction::Switch:
	      BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI);
	      break;
	    case Instruction::Ret: {
	      auto *RI = cast<ReturnInst>(Term);
	      // Try to determine the return value if we can. This is mainly here to
	      // simplify the writing of unit tests, but also helps to enable IPO by
	      // constant folding the return values of callees.
	      auto *RetVal = RI->getReturnValue();
	      if (!RetVal) break; // handle "ret void"
	      if (isa<Constant>(RetVal)) break; // nothing to do
	      if (auto *C = getConstantAt(RetVal, RI, LVI)) {
	        ++NumReturns;
	        RI->replaceUsesOfWith(RetVal, C);
	        BBChanged = true;
	      }
	    }
	    }

	    FnChanged |= BBChanged;
	  }

	  return FnChanged;
	}

	/// FunctionPass entry point: unless skipFunction() says to skip \p F, fetch
	/// LazyValueInfo from the wrapper pass and hand over to runImpl.
	bool CorrelatedValuePropagation::runOnFunction(Function &F) {
	  if (skipFunction(F))
	    return false;

	  return runImpl(F, &getAnalysis<LazyValueInfoWrapperPass>().getLVI());
	}

	/// Analysis-manager entry point: run the shared implementation with the
	/// LazyValueAnalysis result for \p F and report which analyses survive.
	PreservedAnalyses
	CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
	  const bool Changed = runImpl(F, &AM.getResult<LazyValueAnalysis>(F));

	  // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
	  // solution?
	  AM.invalidate<LazyValueAnalysis>(F);

	  if (Changed) {
	    // Only GlobalsAA is claimed to be preserved once the IR was modified.
	    PreservedAnalyses PA;
	    PA.preserve<GlobalsAA>();
	    return PA;
	  }
	  return PreservedAnalyses::all();
	}
	Index: projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp (revision 314268)
	+++ projects/clang400-import/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp (revision 314269)
	@@ -1,2280 +1,2280 @@
	//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass reassociates commutative expressions in an order that is designed
	// to promote better constant propagation, GCSE, LICM, PRE, etc.
	//
	// For example: 4 + (x + 5) -> x + (4 + 5)
	//
	// In the implementation of this algorithm, constants are assigned rank = 0,
	// function arguments are rank = 1, and other values are assigned ranks
	// corresponding to the reverse post order traversal of current function
	// (starting at 2), which effectively gives values in deep loops higher rank
	// than values not in loops.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Transforms/Scalar/Reassociate.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/PostOrderIterator.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/GlobalsModRef.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/IR/CFG.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/ValueHandle.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/Scalar.h"
	#include "llvm/Transforms/Utils/Local.h"
	#include <algorithm>
	using namespace llvm;
	using namespace reassociate;

	#define DEBUG_TYPE "reassociate"

	STATISTIC(NumChanged, "Number of insts reassociated");
	STATISTIC(NumAnnihil, "Number of expr tree annihilated");
	STATISTIC(NumFactor , "Number of multiplies factored");

	#ifndef NDEBUG
	/// Debug-only helper: write the opcode name of \p I, the type of the first
	/// operand in \p Ops, and then every operand with its rank to dbgs().
	/// Reads Ops[0] unconditionally, so \p Ops must not be empty.
	static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
	  Module *M = I->getModule();
	  dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
	         << *Ops[0].Op->getType() << '\t';
	  for (const ValueEntry &Entry : Ops) {
	    dbgs() << "[ ";
	    Entry.Op->printAsOperand(dbgs(), false, M);
	    dbgs() << ", #" << Entry.Rank << "] ";
	  }
	}
	#endif

	/// Utility class representing a non-constant Xor-operand. We classify
	/// non-constant Xor-Operands into two categories:
	/// C1) The operand is in the form "X & C", where C is a constant and C != ~0
	/// C2)
	/// C2.1) The operand is in the form of "X | C", where C is a non-zero
	/// constant.
	/// C2.2) Any operand E which doesn't fall into C1 and C2.1, we view this
	/// operand as "E | 0"
	class llvm::reassociate::XorOpnd {
	public:
	XorOpnd(Value *V);

	// An operand counts as invalid once its symbolic part has been cleared.
	bool isInvalid() const { return SymbolicPart == nullptr; }
	bool isOrExpr() const { return isOr; }
	Value *getValue() const { return OrigVal; }
	Value *getSymbolicPart() const { return SymbolicPart; }
	unsigned getSymbolicRank() const { return SymbolicRank; }
	const APInt &getConstPart() const { return ConstPart; }

	// Clears both the symbolic part and the original value.
	void Invalidate() { SymbolicPart = OrigVal = nullptr; }
	void setSymbolicRank(unsigned R) { SymbolicRank = R; }

	private:
	Value *OrigVal;        // The value the operand was constructed from.
	Value *SymbolicPart;   // The non-constant part X (or the whole value).
	APInt ConstPart;       // The constant C of "X & C" / "X | C", or zero.
	unsigned SymbolicRank; // Starts at 0; set later via setSymbolicRank().
	bool isOr;             // True for the "X | C" form, false for "X & C".
	};

	/// Decompose \p V into a symbolic part and a constant part. An 'or' or 'and'
	/// with a ConstantInt operand is split into (X, C); anything else is viewed
	/// as "V | 0". \p V itself must not be a ConstantInt.
	XorOpnd::XorOpnd(Value *V) {
	  assert(!isa<ConstantInt>(V) && "No ConstantInt");
	  OrigVal = V;
	  Instruction *I = dyn_cast<Instruction>(V);
	  SymbolicRank = 0;

	  if (I && (I->getOpcode() == Instruction::Or ||
	            I->getOpcode() == Instruction::And)) {
	    Value *V0 = I->getOperand(0);
	    Value *V1 = I->getOperand(1);
	    // Move a constant on the left-hand side over to V1.
	    if (isa<ConstantInt>(V0))
	      std::swap(V0, V1);

	    if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) {
	      ConstPart = C->getValue();
	      SymbolicPart = V0;
	      isOr = (I->getOpcode() == Instruction::Or);
	      return;
	    }
	  }

	  // view the operand as "V | 0"
	  SymbolicPart = V;
	  ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth());
	  isOr = true;
	}

	/// If V is an instruction of the specified opcode that has exactly one use
	/// (and, for floating-point operators, has unsafe-algebra set), return it as
	/// a BinaryOperator; otherwise return nullptr.
	static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
	  if (V->hasOneUse() && isa<Instruction>(V) &&
	      cast<Instruction>(V)->getOpcode() == Opcode &&
	      (!isa<FPMathOperator>(V) ||
	       cast<Instruction>(V)->hasUnsafeAlgebra()))
	    return cast<BinaryOperator>(V);
	  return nullptr;
	}

	/// Two-opcode variant: if V is a single-use instruction whose opcode is
	/// either Opcode1 or Opcode2 (and, for floating-point operators, has
	/// unsafe-algebra set), return it as a BinaryOperator; otherwise nullptr.
	static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
	                                        unsigned Opcode2) {
	  if (V->hasOneUse() && isa<Instruction>(V) &&
	      (cast<Instruction>(V)->getOpcode() == Opcode1 ||
	       cast<Instruction>(V)->getOpcode() == Opcode2) &&
	      (!isa<FPMathOperator>(V) ||
	       cast<Instruction>(V)->hasUnsafeAlgebra()))
	    return cast<BinaryOperator>(V);
	  return nullptr;
	}

	/// Seed the rank tables for \p F: each argument gets its own rank, each
	/// basic block (visited in the order given by \p RPOT) gets a base rank, and
	/// instructions for which mayBeMemoryDependent() holds get fixed ranks.
	void ReassociatePass::BuildRankMap(Function &F,
	                                   ReversePostOrderTraversal<Function*> &RPOT) {
	  // The counter starts at 2 and is pre-incremented, so the first argument
	  // receives rank 3.
	  unsigned NextRank = 2;

	  // Assign distinct ranks to function arguments.
	  for (Argument &Arg : F.args()) {
	    ValueRankMap[&Arg] = ++NextRank;
	    DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << NextRank << "\n");
	  }

	  // Traverse basic blocks in ReversePostOrder
	  for (BasicBlock *BB : RPOT) {
	    // A block's base rank is the running counter shifted left by 16 bits.
	    ++NextRank;
	    unsigned InstRank = NextRank << 16;
	    RankMap[BB] = InstRank;

	    // Walk the basic block, adding precomputed ranks for any instructions that
	    // we cannot move. This ensures that the ranks for these instructions are
	    // all different in the block.
	    for (Instruction &Inst : *BB) {
	      if (!mayBeMemoryDependent(Inst))
	        continue;
	      ValueRankMap[&Inst] = ++InstRank;
	    }
	  }
	}

	/// Return the rank of \p V, computing and caching it on first use.
	/// Non-instructions are rank 0 unless they are function arguments, which
	/// use the entry stored in ValueRankMap.
	unsigned ReassociatePass::getRank(Value *V) {
	  Instruction *Inst = dyn_cast<Instruction>(V);

	  // Function argument, or else a global or constant with rank 0.
	  if (!Inst)
	    return isa<Argument>(V) ? ValueRankMap[V] : 0;

	  // A non-zero entry means the rank is already known.
	  if (unsigned Cached = ValueRankMap[Inst])
	    return Cached;

	  // If this is an expression, the rank is 1+MAX(rank(LHS), rank(RHS)) so that
	  // we can reassociate expressions for code motion! Since we do not recurse
	  // for PHI nodes, we cannot have infinite recursion here, because there
	  // cannot be loops in the value graph that do not go through PHI nodes.
	  const unsigned BlockRank = RankMap[Inst->getParent()];
	  unsigned Computed = 0;
	  for (unsigned OpIdx = 0, NumOps = Inst->getNumOperands(); OpIdx != NumOps;
	       ++OpIdx) {
	    // Stop as soon as the running maximum equals the block's own rank.
	    if (Computed == BlockRank)
	      break;
	    Computed = std::max(Computed, getRank(Inst->getOperand(OpIdx)));
	  }

	  // A not or neg instruction is not counted for rank. This assures us that
	  // X and ~X will have the same rank.
	  const bool IsNegation = BinaryOperator::isNot(Inst) ||
	                          BinaryOperator::isNeg(Inst) ||
	                          BinaryOperator::isFNeg(Inst);
	  if (!IsNegation)
	    ++Computed;

	  DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Computed << "\n");

	  ValueRankMap[Inst] = Computed;
	  return Computed;
	}

	// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
	// I must be a commutative BinaryOperator. Operands are swapped when the LHS
	// is a constant or when the RHS has the lower rank.
	void ReassociatePass::canonicalizeOperands(Instruction *I) {
	  assert(isa<BinaryOperator>(I) && "Expected binary operator.");
	  assert(I->isCommutative() && "Expected commutative operator.");

	  Value *LHS = I->getOperand(0);
	  Value *RHS = I->getOperand(1);
	  unsigned LHSRank = getRank(LHS);
	  unsigned RHSRank = getRank(RHS);

	  // Already canonical: the constant is on the right.
	  if (isa<Constant>(RHS))
	    return;

	  if (isa<Constant>(LHS) || RHSRank < LHSRank)
	    cast<BinaryOperator>(I)->swapOperands();
	}

	/// Create "S1 + S2" before \p InsertBefore: an integer add when S1 has an
	/// integer (or integer vector) type, otherwise an fadd that copies the
	/// fast-math flags of \p FlagsOp. FlagsOp is only read on the fadd path.
	static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
	                                 Instruction *InsertBefore, Value *FlagsOp) {
	  if (S1->getType()->isIntOrIntVectorTy())
	    return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
	  else {
	    BinaryOperator *Res =
	        BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
	    Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
	    return Res;
	  }
	}

	/// Create "S1 * S2" before \p InsertBefore: an integer mul when S1 has an
	/// integer (or integer vector) type, otherwise an fmul that copies the
	/// fast-math flags of \p FlagsOp. FlagsOp is only read on the fmul path.
	static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
	                                 Instruction *InsertBefore, Value *FlagsOp) {
	  if (S1->getType()->isIntOrIntVectorTy())
	    return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
	  else {
	    BinaryOperator *Res =
	        BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
	    Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
	    return Res;
	  }
	}

	/// Create the negation of \p S1 before \p InsertBefore: an integer neg when
	/// S1 has an integer (or integer vector) type, otherwise an fneg that copies
	/// the fast-math flags of \p FlagsOp. FlagsOp is only read on the fneg path.
	static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
	                                 Instruction *InsertBefore, Value *FlagsOp) {
	  if (S1->getType()->isIntOrIntVectorTy())
	    return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
	  else {
	    BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore);
	    Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
	    return Res;
	  }
	}

	/// Replace 0-X with X*-1.
	///
	/// \param Neg the negate instruction; operand 1 is the value being negated.
	///            It is not erased here, but all its uses are redirected.
	/// \returns the new multiply, which takes over Neg's name and debug location.
	static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
	  Type *Ty = Neg->getType();
	  // -1 as all-ones for integer types, -1.0 for floating point.
	  Constant *NegOne = Ty->isIntOrIntVectorTy() ?
	      ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0);

	  BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg);
	  Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op.
	  Res->takeName(Neg);
	  Neg->replaceAllUsesWith(Res);
	  Res->setDebugLoc(Neg->getDebugLoc());
	  return Res;
	}

	/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
	/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
	/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
	/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
	/// even x in Bitwidth-bit arithmetic.
	static unsigned CarmichaelShift(unsigned Bitwidth) {
	  // Widths below 3 give Bitwidth - 1; all wider types give Bitwidth - 2.
	  return Bitwidth < 3 ? Bitwidth - 1 : Bitwidth - 2;
	}

	/// Add the extra weight 'RHS' to the existing weight 'LHS',
	/// reducing the combined weight using any special properties of the operation.
	/// The existing weight LHS represents the computation X op X op ... op X where
	/// X occurs LHS times. The combined weight represents X op X op ... op X with
	/// X occurring LHS + RHS times. If op is "Xor" for example then the combined
	/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
	/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
	///
	/// \param LHS    in/out: existing weight, overwritten with the combined one.
	/// \param RHS    weight to add.
	/// \param Opcode the associative operation; must be idempotent, nilpotent,
	///               Add/FAdd or Mul/FMul (asserted).
	static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
	  // If we were working with infinite precision arithmetic then the combined
	  // weight would be LHS + RHS. But we are using finite precision arithmetic,
	  // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
	  // for nilpotent operations and addition, but not for idempotent operations
	  // and multiplication), so it is important to correctly reduce the combined
	  // weight back into range if wrapping would be wrong.

	  // If RHS is zero then the weight didn't change.
	  if (RHS.isMinValue())
	    return;
	  // If LHS is zero then the combined weight is RHS.
	  if (LHS.isMinValue()) {
	    LHS = RHS;
	    return;
	  }
	  // From this point on we know that neither LHS nor RHS is zero.

	  if (Instruction::isIdempotent(Opcode)) {
	    // Idempotent means X op X === X, so any non-zero weight is equivalent to a
	    // weight of 1. Keeping weights at zero or one also means that wrapping is
	    // not a problem.
	    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
	    return; // Return a weight of 1.
	  }
	  if (Instruction::isNilpotent(Opcode)) {
	    // Nilpotent means X op X === 0, so reduce weights modulo 2.
	    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
	    LHS = 0; // 1 + 1 === 0 modulo 2.
	    return;
	  }
	  if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
	    // TODO: Reduce the weight by exploiting nsw/nuw?
	    LHS += RHS;
	    return;
	  }

	  assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
	         "Unknown associative operation!");
	  unsigned Bitwidth = LHS.getBitWidth();
	  // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
	  // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
	  // bit number x, since either x is odd in which case x^CM = 1, or x is even in
	  // which case both x^W and x^(W - CM) are zero. By subtracting off multiples
	  // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
	  // which by a happy accident means that they can always be represented using
	  // Bitwidth bits.
	  // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
	  // the Carmichael number).
	  if (Bitwidth > 3) {
	    /// CM - The value of Carmichael's lambda function.
	    APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
	    // Any weight W >= Threshold can be replaced with W - CM.
	    APInt Threshold = CM + Bitwidth;
	    assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
	    // For Bitwidth 4 or more the following sum does not overflow.
	    LHS += RHS;
	    while (LHS.uge(Threshold))
	      LHS -= CM;
	  } else {
	    // To avoid problems with overflow do everything the same as above but using
	    // a larger type.
	    unsigned CM = 1U << CarmichaelShift(Bitwidth);
	    unsigned Threshold = CM + Bitwidth;
	    assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
	           "Weights not reduced!");
	    unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
	    while (Total >= Threshold)
	      Total -= CM;
	    LHS = Total;
	  }
	}

	// A leaf value paired with its weight, i.e. how many times it occurs.
	typedef std::pair<Value*, APInt> RepeatedValue;

	/// Given an associative binary expression, return the leaf
	/// nodes in Ops along with their weights (how many times the leaf occurs). The
	/// original expression is the same as
	/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times
	/// op
	/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times
	/// op
	/// ...
	/// op
	/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
	///
	/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
	///
	/// This routine may modify the function, in which case it returns 'true'. The
	/// changes it makes may well be destructive, changing the value computed by 'I'
	/// to something completely different. Thus if the routine returns 'true' then
	/// you MUST either replace I with a new expression computed from the Ops array,
	/// or use RewriteExprTree to put the values back in.
	///
	/// A leaf node is either not a binary operation of the same kind as the root
	/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
	/// opcode), or is the same kind of binary operator but has a use which either
	/// does not belong to the expression, or does belong to the expression but is
	/// a leaf node. Every leaf node has at least one use that is a non-leaf node
	/// of the expression, while for non-leaf nodes (except for the root 'I') every
	/// use is a non-leaf node of the expression.
	///
	/// For example:
	///           expression graph        node names
	///
	///                     +        |        I
	///                    / \       |
	///                   +   +      |      A,  B
	///                  / \ / \     |
	///                 *   +   *    |    C,  D,  E
	///                / \ / \ / \   |
	///                   +   *      |      F,  G
	///
	/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in
	/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
	///
	/// The expression is maximal: if some instruction is a binary operator of the
	/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
	/// then the instruction also belongs to the expression, is not a leaf node of
	/// it, and its operands also belong to the expression (but may be leaf nodes).
	///
	/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
	/// order to ensure that every non-root node in the expression has exactly one
	/// use by a non-leaf node of the expression. This destruction means that the
	/// caller MUST either replace 'I' with a new expression or use something like
	/// RewriteExprTree to put the values back in if the routine indicates that it
	/// made a change by returning 'true'.
	///
	/// In the above example either the right operand of A or the left operand of B
	/// will be replaced by undef. If it is B's operand then this gives:
	///
	///                     +        |        I
	///                    / \       |
	///                   +   +      |      A,  B - operand of B replaced with undef
	///                  / \   \     |
	///                 *   +   *    |    C,  D,  E
	///                / \ / \ / \   |
	///                   +   *      |      F,  G
	///
	/// Note that such undef operands can only be reached by passing through 'I'.
	/// For example, if you visit operands recursively starting from a leaf node
	/// then you will never see such an undef operand unless you get back to 'I',
	/// which requires passing through a phi node.
	///
	/// Note that this routine may also mutate binary operators of the wrong type
	/// that have all uses inside the expression (i.e. only used by non-leaf nodes
	/// of the expression) if it can turn them into binary operators of the right
	/// type and thus make the expression bigger.

	static bool LinearizeExprTree(BinaryOperator *I,
	                              SmallVectorImpl<RepeatedValue> &Ops) {
	  DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
	  unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
	  unsigned Opcode = I->getOpcode();
	  assert(I->isAssociative() && I->isCommutative() &&
	         "Expected an associative and commutative operation!");

	  // Visit all operands of the expression, keeping track of their weight (the
	  // number of paths from the expression root to the operand, or if you like
	  // the number of times that operand occurs in the linearized expression).
	  // For example, if I = X + A, where X = A + B, then I, X and B have weight 1
	  // while A has weight two.

	  // Worklist of non-leaf nodes (their operands are in the expression too) along
	  // with their weights, representing a certain number of paths to the operator.
	  // If an operator occurs in the worklist multiple times then we found multiple
	  // ways to get to it.
	  SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight)
	  Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
	  bool Changed = false;

	  // Leaves of the expression are values that either aren't the right kind of
	  // operation (eg: a constant, or a multiply in an add tree), or are, but have
	  // some uses that are not inside the expression. For example, in I = X + X,
	  // X = A + B, the value X has two uses (by I) that are in the expression. If
	  // X has any other uses, for example in a return instruction, then we consider
	  // X to be a leaf, and won't analyze it further. When we first visit a value,
	  // if it has more than one use then at first we conservatively consider it to
	  // be a leaf. Later, as the expression is explored, we may discover some more
	  // uses of the value from inside the expression. If all uses turn out to be
	  // from within the expression (and the value is a binary operator of the right
	  // kind) then the value is no longer considered to be a leaf, and its operands
	  // are explored.

	  // Leaves - Keeps track of the set of putative leaves as well as the number of
	  // paths to each leaf seen so far.
	  typedef DenseMap<Value*, APInt> LeafMap;
	  LeafMap Leaves; // Leaf -> Total weight so far.
	  SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order.

	#ifndef NDEBUG
	  SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme.
	#endif
	  while (!Worklist.empty()) {
	    std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val();
	    I = P.first; // We examine the operands of this binary operator.

	    for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
	      Value *Op = I->getOperand(OpIdx);
	      APInt Weight = P.second; // Number of paths to this operand.
	      DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
	      assert(!Op->use_empty() && "No uses, so how did we get to it?!");

	      // If this is a binary operation of the right kind with only one use then
	      // add its operands to the expression.
	      if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
	        assert(Visited.insert(Op).second && "Not first visit!");
	        DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
	        Worklist.push_back(std::make_pair(BO, Weight));
	        continue;
	      }

	      // Appears to be a leaf. Is the operand already in the set of leaves?
	      LeafMap::iterator It = Leaves.find(Op);
	      if (It == Leaves.end()) {
	        // Not in the leaf map. Must be the first time we saw this operand.
	        assert(Visited.insert(Op).second && "Not first visit!");
	        if (!Op->hasOneUse()) {
	          // This value has uses not accounted for by the expression, so it is
	          // not safe to modify. Mark it as being a leaf.
	          DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
	          LeafOrder.push_back(Op);
	          Leaves[Op] = Weight;
	          continue;
	        }
	        // No uses outside the expression, try morphing it.
	      } else {
	        // Already in the leaf map.
	        assert(It != Leaves.end() && Visited.count(Op) &&
	               "In leaf map but not visited!");

	        // Update the number of paths to the leaf.
	        IncorporateWeight(It->second, Weight, Opcode);

	#if 0 // TODO: Re-enable once PR13021 is fixed.
	        // The leaf already has one use from inside the expression. As we want
	        // exactly one such use, drop this new use of the leaf.
	        assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
	        I->setOperand(OpIdx, UndefValue::get(I->getType()));
	        Changed = true;

	        // If the leaf is a binary operation of the right kind and we now see
	        // that its multiple original uses were in fact all by nodes belonging
	        // to the expression, then no longer consider it to be a leaf and add
	        // its operands to the expression.
	        if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
	          DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
	          Worklist.push_back(std::make_pair(BO, It->second));
	          Leaves.erase(It);
	          continue;
	        }
	#endif

	        // If we still have uses that are not accounted for by the expression
	        // then it is not safe to modify the value.
	        if (!Op->hasOneUse())
	          continue;

	        // No uses outside the expression, try morphing it.
	        Weight = It->second;
	        Leaves.erase(It); // Since the value may be morphed below.
	      }

	      // At this point we have a value which, first of all, is not a binary
	      // expression of the right kind, and secondly, is only used inside the
	      // expression. This means that it can safely be modified. See if we
	      // can usefully morph it into an expression of the right kind.
	      assert((!isa<Instruction>(Op) ||
	              cast<Instruction>(Op)->getOpcode() != Opcode
	              || (isa<FPMathOperator>(Op) &&
	                  !cast<Instruction>(Op)->hasUnsafeAlgebra())) &&
	             "Should have been handled above!");
	      assert(Op->hasOneUse() && "Has uses outside the expression tree!");

	      // If this is a multiply expression, turn any internal negations into
	      // multiplies by -1 so they can be reassociated.
	      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
	        if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
	            (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
	          DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
	          BO = LowerNegateToMultiply(BO);
	          DEBUG(dbgs() << *BO << '\n');
	          Worklist.push_back(std::make_pair(BO, Weight));
	          Changed = true;
	          continue;
	        }

	      // Failed to morph into an expression of the right type. This really is
	      // a leaf.
	      DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
	      assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
	      LeafOrder.push_back(Op);
	      Leaves[Op] = Weight;
	    }
	  }

	  // The leaves, repeated according to their weights, represent the linearized
	  // form of the expression.
	  for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
	    Value *V = LeafOrder[i];
	    LeafMap::iterator It = Leaves.find(V);
	    if (It == Leaves.end())
	      // Node initially thought to be a leaf wasn't.
	      continue;
	    assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
	    APInt Weight = It->second;
	    if (Weight.isMinValue())
	      // Leaf already output or weight reduction eliminated it.
	      continue;
	    // Ensure the leaf is only output once (a value may appear in LeafOrder
	    // more than once if it was removed from and re-added to the leaf map).
	    It->second = 0;
	    Ops.push_back(std::make_pair(V, Weight));
	  }

	  // For nilpotent operations or addition there may be no operands, for example
	  // because the expression was "X xor X" or consisted of 2^Bitwidth additions:
	  // in both cases the weight reduces to 0 causing the value to be skipped.
	  if (Ops.empty()) {
	    Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
	    assert(Identity && "Associative operation without identity!");
	    Ops.emplace_back(Identity, APInt(Bitwidth, 1));
	  }

	  return Changed;
	}

	/// Now that the operands for this expression tree are
	/// linearized and optimized, emit them in-order.
	///
	/// I is the root of the expression and Ops the operand list to write back
	/// into it; Ops must contain at least two entries (asserted below). Starting
	/// at I, each visited operator gets Ops[i] as its right operand and another
	/// operator as its left operand, until the last operator, which receives the
	/// final two entries of Ops. Every modification sets MadeChange and bumps
	/// NumChanged. Operators of the original expression that end up unused are
	/// added to RedoInsts.
	void ReassociatePass::RewriteExprTree(BinaryOperator *I,
	SmallVectorImpl<ValueEntry> &Ops) {
	assert(Ops.size() > 1 && "Single values should be used directly!");

	// Since our optimizations should never increase the number of operations, the
	// new expression can usually be written reusing the existing binary operators
	// from the original expression tree, without creating any new instructions,
	// though the rewritten expression may have a completely different topology.
	// We take care to not change anything if the new expression will be the same
	// as the original. If more than trivial changes (like commuting operands)
	// were made then we are obliged to clear out any optional subclass data like
	// nsw flags.

	/// NodesToRewrite - Nodes from the original expression available for writing
	/// the new expression into.
	SmallVector<BinaryOperator*, 8> NodesToRewrite;
	unsigned Opcode = I->getOpcode();
	// Op is the operator currently being rewritten; it starts at the root.
	BinaryOperator *Op = I;

	/// NotRewritable - The operands being written will be the leaves of the new
	/// expression and must not be used as inner nodes (via NodesToRewrite) by
	/// mistake. Inner nodes are always reassociable, and usually leaves are not
	/// (if they were they would have been incorporated into the expression and so
	/// would not be leaves), so most of the time there is no danger of this. But
	/// in rare cases a leaf may become reassociable if an optimization kills uses
	/// of it, or it may momentarily become reassociable during rewriting (below)
	/// due to it being removed as an operand of one of its uses. Ensure that
	/// misuse of leaf nodes as inner nodes cannot occur by remembering all of the
	/// future leaves and refusing to reuse any of them as inner nodes.
	SmallPtrSet<Value*, 8> NotRewritable;
	for (unsigned i = 0, e = Ops.size(); i != e; ++i)
	NotRewritable.insert(Ops[i].Op);

	// ExpressionChanged - Non-null if the rewritten expression differs from the
	// original in some non-trivial way, requiring the clearing of optional flags.
	// Flags are cleared from the operator in ExpressionChanged up to I inclusive.
	BinaryOperator *ExpressionChanged = nullptr;
	// The loop has no exit condition of its own: it terminates via the 'break'
	// statements in the "last operation" case below.
	for (unsigned i = 0; ; ++i) {
	// The last operation (which comes earliest in the IR) is special as both
	// operands will come from Ops, rather than just one with the other being
	// a subexpression.
	if (i+2 == Ops.size()) {
	Value *NewLHS = Ops[i].Op;
	Value *NewRHS = Ops[i+1].Op;
	Value *OldLHS = Op->getOperand(0);
	Value *OldRHS = Op->getOperand(1);

	if (NewLHS == OldLHS && NewRHS == OldRHS)
	// Nothing changed, leave it alone.
	break;

	if (NewLHS == OldRHS && NewRHS == OldLHS) {
	// The order of the operands was reversed. Swap them.
	DEBUG(dbgs() << "RA: " << *Op << '\n');
	Op->swapOperands();
	DEBUG(dbgs() << "TO: " << *Op << '\n');
	MadeChange = true;
	++NumChanged;
	break;
	}

	// The new operation differs non-trivially from the original. Overwrite
	// the old operands with the new ones.
	DEBUG(dbgs() << "RA: " << *Op << '\n');
	if (NewLHS != OldLHS) {
	// An overwritten operand that is itself a reassociable operator (and not
	// a future leaf) becomes a spare node for later reuse.
	BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
	if (BO && !NotRewritable.count(BO))
	NodesToRewrite.push_back(BO);
	Op->setOperand(0, NewLHS);
	}
	if (NewRHS != OldRHS) {
	BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
	if (BO && !NotRewritable.count(BO))
	NodesToRewrite.push_back(BO);
	Op->setOperand(1, NewRHS);
	}
	DEBUG(dbgs() << "TO: " << *Op << '\n');

	ExpressionChanged = Op;
	MadeChange = true;
	++NumChanged;

	break;
	}

	// Not the last operation. The left-hand side will be a sub-expression
	// while the right-hand side will be the current element of Ops.
	Value *NewRHS = Ops[i].Op;
	if (NewRHS != Op->getOperand(1)) {
	DEBUG(dbgs() << "RA: " << *Op << '\n');
	if (NewRHS == Op->getOperand(0)) {
	// The new right-hand side was already present as the left operand. If
	// we are lucky then swapping the operands will sort out both of them.
	// A pure swap does not set ExpressionChanged, so flags are kept.
	Op->swapOperands();
	} else {
	// Overwrite with the new right-hand side.
	BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
	if (BO && !NotRewritable.count(BO))
	NodesToRewrite.push_back(BO);
	Op->setOperand(1, NewRHS);
	ExpressionChanged = Op;
	}
	DEBUG(dbgs() << "TO: " << *Op << '\n');
	MadeChange = true;
	++NumChanged;
	}

	// Now deal with the left-hand side. If this is already an operation node
	// from the original expression then just rewrite the rest of the expression
	// into it.
	BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
	if (BO && !NotRewritable.count(BO)) {
	Op = BO;
	continue;
	}

	// Otherwise, grab a spare node from the original expression and use that as
	// the left-hand side. If there are no nodes left then the optimizers made
	// an expression with more nodes than the original! This usually means that
	// they did something stupid but it might mean that the problem was just too
	// hard (finding the minimal number of multiplications needed to realize a
	// multiplication expression is NP-complete). Whatever the reason, smart or
	// stupid, create a new node if there are none left.
	BinaryOperator *NewOp;
	if (NodesToRewrite.empty()) {
	// The undef operands are placeholders; they are overwritten when NewOp is
	// visited as 'Op' on the following iterations.
	Constant *Undef = UndefValue::get(I->getType());
	NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
	Undef, Undef, "", I);
	if (NewOp->getType()->isFPOrFPVectorTy())
	NewOp->setFastMathFlags(I->getFastMathFlags());
	} else {
	NewOp = NodesToRewrite.pop_back_val();
	}

	DEBUG(dbgs() << "RA: " << *Op << '\n');
	Op->setOperand(0, NewOp);
	DEBUG(dbgs() << "TO: " << *Op << '\n');
	ExpressionChanged = Op;
	MadeChange = true;
	++NumChanged;
	Op = NewOp;
	}

	// If the expression changed non-trivially then clear out all subclass data
	// starting from the operator specified in ExpressionChanged, and compactify
	// the operators to just before the expression root to guarantee that the
	// expression tree is dominated by all of Ops.
	if (ExpressionChanged)
	do {
	// Preserve FastMathFlags.
	if (isa<FPMathOperator>(I)) {
	FastMathFlags Flags = I->getFastMathFlags();
	ExpressionChanged->clearSubclassOptionalData();
	ExpressionChanged->setFastMathFlags(Flags);
	} else
	ExpressionChanged->clearSubclassOptionalData();

	if (ExpressionChanged == I)
	break;
	ExpressionChanged->moveBefore(I);
	// Step to the user of this operator; the cast<> requires that user to be a
	// BinaryOperator. The walk stops once it reaches I.
	ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
	} while (1);

	// Throw away any left over nodes from the original expression.
	for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
	RedoInsts.insert(NodesToRewrite[i]);
	}

	/// Compute the negation of V, inserting any instructions that are needed
	/// before BI, and return the negated value.
	///
	/// - Constants are folded with ConstantExpr::getNeg / getFNeg.
	/// - A reassociable add/fadd has the negation pushed into both operands and
	///   is itself moved before BI and renamed with a ".neg" suffix.
	/// - Otherwise an existing negate of V in the same function is reused (after
	///   being moved so that it is placed right after V's definition, or at the
	///   start of the entry block when V is not an instruction); failing that a
	///   new negate is created before BI.
	///
	/// Every instruction that is modified, moved or created here is added to
	/// ToRedo so that it can be revisited to see if additional opportunities
	/// have been exposed.
	static Value *NegateValue(Value *V, Instruction *BI,
	                          SetVector<AssertingVH<Instruction>> &ToRedo) {
	  if (Constant *C = dyn_cast<Constant>(V)) {
	    if (C->getType()->isFPOrFPVectorTy()) {
	      return ConstantExpr::getFNeg(C);
	    }
	    return ConstantExpr::getNeg(C);
	  }


	  // We are trying to expose opportunity for reassociation. One of the things
	  // that we want to do to achieve this is to push a negation as deep into an
	  // expression chain as possible, to expose the add instructions. In practice,
	  // this means that we turn this:
	  //   X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
	  // so that later, a: Y = 12+X could get reassociated with the -12 to eliminate
	  // the constants. We assume that instcombine will clean up the mess later if
	  // we introduce tons of unnecessary negation instructions.
	  //
	  if (BinaryOperator *I =
	          isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
	    // Push the negates through the add.
	    I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
	    I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
	    if (I->getOpcode() == Instruction::Add) {
	      // The operands changed, so the wrap flags no longer hold.
	      I->setHasNoUnsignedWrap(false);
	      I->setHasNoSignedWrap(false);
	    }

	    // We must move the add instruction here, because the neg instructions do
	    // not dominate the old add instruction in general. By moving it, we are
	    // assured that the neg instructions we just inserted dominate the
	    // instruction we are about to insert after them.
	    //
	    I->moveBefore(BI);
	    I->setName(I->getName()+".neg");

	    // Add the intermediate negates to the redo list as processing them later
	    // could expose more reassociating opportunities.
	    ToRedo.insert(I);
	    return I;
	  }

	  // Okay, we need to materialize a negated version of V with an instruction.
	  // Scan the use lists of V to see if we have one already.
	  for (User *U : V->users()) {
	    if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U))
	      continue;

	    // We found one! Now we have to make sure that the definition dominates
	    // this use. We do this by moving it to the entry block (if it is a
	    // non-instruction value) or right after the definition. These negates will
	    // be zapped by reassociate later, so we don't need much finesse here.
	    BinaryOperator *TheNeg = cast<BinaryOperator>(U);

	    // Verify that the negate is in this function, V might be a constant expr.
	    if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
	      continue;

	    BasicBlock::iterator InsertPt;
	    if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
	      if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
	        // An invoke ends its block, so insert in its normal destination.
	        InsertPt = II->getNormalDest()->begin();
	      } else {
	        InsertPt = ++InstInput->getIterator();
	      }
	      // Skip past any PHI nodes at the chosen position.
	      while (isa<PHINode>(InsertPt)) ++InsertPt;
	    } else {
	      InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
	    }
	    TheNeg->moveBefore(&*InsertPt);
	    if (TheNeg->getOpcode() == Instruction::Sub) {
	      TheNeg->setHasNoUnsignedWrap(false);
	      TheNeg->setHasNoSignedWrap(false);
	    } else {
	      TheNeg->andIRFlags(BI);
	    }
	    ToRedo.insert(TheNeg);
	    return TheNeg;
	  }

	  // Insert a 'neg' instruction that subtracts the value from zero to get the
	  // negation.
	  BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
	  ToRedo.insert(NewNeg);
	  return NewNeg;
	}

	/// Return true if we should break up this subtract of X-Y into (X + -Y).
	///
	/// The subtract is broken up when either operand is a reassociable
	/// add/sub (or fadd/fsub), or when its single user is one. Negations and
	/// "X - undef" are never broken up.
	static bool ShouldBreakUpSubtract(Instruction *Sub) {
	  // If this is a negation, we can't split it up!
	  if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub))
	    return false;

	  // Don't breakup X - undef.
	  if (isa<UndefValue>(Sub->getOperand(1)))
	    return false;

	  // Don't bother to break this up unless either the LHS is an associable add or
	  // subtract or if this is only used by one.
	  Value *V0 = Sub->getOperand(0);
	  if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
	      isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
	    return true;
	  Value *V1 = Sub->getOperand(1);
	  if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
	      isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
	    return true;

	  // Only look at the user once we know there is exactly one: user_back()
	  // dereferences the use list and must not be called on a value without
	  // users.
	  if (Sub->hasOneUse()) {
	    Value *VB = Sub->user_back();
	    if (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
	        isReassociableOp(VB, Instruction::Sub, Instruction::FSub))
	      return true;
	  }

	  return false;
	}

	/// Rewrite the subtract (X-Y) as the add (X + -Y) so that it can take part in
	/// reassociation with neighbouring adds.
	///
	/// The negated right-hand side is obtained from NegateValue (which records
	/// the instructions it touches in ToRedo). The new add is created at Sub's
	/// position, takes over Sub's name, debug location and all of its uses, and
	/// is returned. Sub itself is left in place with both operands replaced by
	/// the null value of its type so that it no longer uses X or Y.
	static BinaryOperator *
	BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
	  // Negate Y first: the negation becomes the right operand of the add.
	  Value *NegatedRHS = NegateValue(Sub->getOperand(1), Sub, ToRedo);
	  BinaryOperator *Add =
	      CreateAdd(Sub->getOperand(0), NegatedRHS, "", Sub, Sub);

	  // Drop the old instruction's uses of its operands.
	  Constant *Zero = Constant::getNullValue(Sub->getType());
	  for (unsigned OperandNo = 0; OperandNo != 2; ++OperandNo)
	    Sub->setOperand(OperandNo, Zero);

	  Add->takeName(Sub);

	  // From here on every user refers to the add instead of the subtract.
	  Sub->replaceAllUsesWith(Add);
	  Add->setDebugLoc(Sub->getDebugLoc());

	  DEBUG(dbgs() << "Negated: " << *Add << '\n');
	  return Add;
	}

	/// If this is a shift of a reassociable multiply or is used by one, change
	/// this into a multiply by a constant to assist with further reassociation.
	///
	/// Shl's shift amount (operand 1) must be a Constant; this is enforced by the
	/// cast<> below. The new multiply "X * (1 << C)" is inserted before Shl,
	/// takes over its name, debug location and all of its uses, and is returned.
	/// Shl is left in place with its first operand replaced by undef.
	static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
	  Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
	  MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));

	  BinaryOperator *Mul =
	      BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
	  Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
	  Mul->takeName(Shl);

	  // Everyone now refers to the mul instruction.
	  Shl->replaceAllUsesWith(Mul);
	  Mul->setDebugLoc(Shl->getDebugLoc());

	  // We can safely preserve the nuw flag in all cases. It's also safe to turn a
	  // nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special
	  // handling.
	  bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
	  bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
	  if (NSW && NUW)
	    Mul->setHasNoSignedWrap(true);
	  Mul->setHasNoUnsignedWrap(NUW);
	  return Mul;
	}

	/// Look for X among the entries of Ops that share the rank of entry i,
	/// scanning forwards from i+1 first and then backwards from i-1. An entry
	/// matches when it is X itself, or when both it and X are instructions and
	/// isIdenticalTo() reports them identical. Returns the index of the first
	/// match, or i when there is none. This is useful when scanning for 'x' when
	/// we see '-x' because they both get the same rank.
	static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
	                                  Value *X) {
	  const unsigned WantedRank = Ops[i].Rank;

	  // True when the entry at Pos is X or an instruction identical to X.
	  auto IsMatchAt = [&Ops, X](unsigned Pos) {
	    Value *Candidate = Ops[Pos].Op;
	    if (Candidate == X)
	      return true;
	    if (Instruction *CandInst = dyn_cast<Instruction>(Candidate))
	      if (Instruction *XInst = dyn_cast<Instruction>(X))
	        return CandInst->isIdenticalTo(XInst);
	    return false;
	  };

	  // Forward scan over the run of equal-rank entries after i.
	  for (unsigned Fwd = i + 1, End = Ops.size();
	       Fwd != End && Ops[Fwd].Rank == WantedRank; ++Fwd)
	    if (IsMatchAt(Fwd))
	      return Fwd;

	  // Backward scan over the run of equal-rank entries before i.
	  for (unsigned Back = i; Back-- != 0 && Ops[Back].Rank == WantedRank;)
	    if (IsMatchAt(Back))
	      return Back;

	  return i;
	}

	/// Emit a tree of add instructions, summing Ops together
	/// and returning the result. Insert the tree before I.
	///
	/// Ops must not be empty. The list is consumed: entries are popped from the
	/// back until a single one remains. With a single entry no instruction is
	/// created and that entry is returned directly. The adds are chained so that
	/// the result is ((Ops[0] + Ops[1]) + Ops[2]) + ...
	static Value *EmitAddTreeOfValues(Instruction *I,
	                                  SmallVectorImpl<WeakVH> &Ops){
	  assert(!Ops.empty() && "Cannot sum an empty list of values!");
	  if (Ops.size() == 1) return Ops.back();

	  Value *V1 = Ops.back();
	  Ops.pop_back();
	  Value *V2 = EmitAddTreeOfValues(I, Ops);
	  return CreateAdd(V2, V1, "tmp", I, I);
	}

	/// If V is an expression tree that is a multiplication sequence,
	/// and if this sequence contains a multiply by Factor,
	/// remove Factor from the tree and return the new tree.
	///
	/// Returns nullptr when V is not a reassociable mul/fmul or when Factor does
	/// not occur in it. A constant factor also matches its own negation
	/// (ConstantInt or ConstantFP); in that case the returned value is negated.
	/// When only one factor remains, that factor is returned directly and the
	/// multiply is queued in RedoInsts.
	Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
	  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
	  if (!BO)
	    return nullptr;

	  SmallVector<RepeatedValue, 8> Tree;
	  MadeChange |= LinearizeExprTree(BO, Tree);
	  SmallVector<ValueEntry, 8> Factors;
	  Factors.reserve(Tree.size());
	  // Expand each (value, weight) pair into 'weight' separate factor entries.
	  for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
	    RepeatedValue E = Tree[i];
	    Factors.append(E.second.getZExtValue(),
	                   ValueEntry(getRank(E.first), E.first));
	  }

	  bool FoundFactor = false;
	  bool NeedsNegate = false;
	  for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
	    if (Factors[i].Op == Factor) {
	      FoundFactor = true;
	      Factors.erase(Factors.begin()+i);
	      break;
	    }

	    // If this is a negative version of this factor, remove it.
	    if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
	      if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
	        if (FC1->getValue() == -FC2->getValue()) {
	          FoundFactor = NeedsNegate = true;
	          Factors.erase(Factors.begin()+i);
	          break;
	        }
	    } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
	      if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
	        const APFloat &F1 = FC1->getValueAPF();
	        APFloat F2(FC2->getValueAPF());
	        F2.changeSign();
	        if (F1.compare(F2) == APFloat::cmpEqual) {
	          FoundFactor = NeedsNegate = true;
	          Factors.erase(Factors.begin() + i);
	          break;
	        }
	      }
	    }
	  }

	  if (!FoundFactor) {
	    // Make sure to restore the operands to the expression tree.
	    RewriteExprTree(BO, Factors);
	    return nullptr;
	  }

	  // Remember the position right after BO before the tree is rewritten; a
	  // negate, if needed, is inserted there.
	  BasicBlock::iterator InsertPt = ++BO->getIterator();

	  // If this was just a single multiply, remove the multiply and return the only
	  // remaining operand.
	  if (Factors.size() == 1) {
	    RedoInsts.insert(BO);
	    V = Factors[0].Op;
	  } else {
	    RewriteExprTree(BO, Factors);
	    V = BO;
	  }

	  if (NeedsNegate)
	    V = CreateNeg(V, "neg", &*InsertPt, BO);

	  return V;
	}

	/// Collect the multiplication factors of V into Factors.
	///
	/// Every value that isReassociableOp() accepts as a mul/fmul is expanded into
	/// its two operands (right operand first, then left); any other value is
	/// appended to Factors as-is.
	///
	/// Ops is the top-level list of add operands we're trying to factor; it is
	/// not read by this function.
	static void FindSingleUseMultiplyFactors(Value *V,
	                                         SmallVectorImpl<Value*> &Factors,
	                                         const SmallVectorImpl<ValueEntry> &Ops) {
	  // Explicit stack of values still to be examined. The left operand is
	  // pushed before the right one so that the right subtree is expanded first.
	  SmallVector<Value*, 8> Pending;
	  Pending.push_back(V);

	  while (!Pending.empty()) {
	    Value *Current = Pending.pop_back_val();
	    BinaryOperator *Mul =
	        isReassociableOp(Current, Instruction::Mul, Instruction::FMul);
	    if (!Mul) {
	      // Not an expandable multiply: this value is a factor.
	      Factors.push_back(Current);
	      continue;
	    }

	    Pending.push_back(Mul->getOperand(0));
	    Pending.push_back(Mul->getOperand(1));
	  }
	}

	/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
	/// This optimizes based on identities. If it can be reduced to a single Value,
	/// it is returned, otherwise the Ops list is mutated as necessary.
	///
	/// Opcode must be Instruction::And, Instruction::Or or Instruction::Xor.
	/// Duplicate detection only compares adjacent entries, so it relies on equal
	/// values being next to each other in Ops. Returns nullptr when the
	/// expression was not reduced to a single value.
	static Value *OptimizeAndOrXor(unsigned Opcode,
	                               SmallVectorImpl<ValueEntry> &Ops) {
	  // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
	  // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
	  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
	    // First, check for X and ~X in the operand list.
	    assert(i < Ops.size());
	    if (BinaryOperator::isNot(Ops[i].Op)) { // Cannot occur for ^.
	      Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
	      unsigned FoundX = FindInOperandList(Ops, i, X);
	      if (FoundX != i) {
	        if (Opcode == Instruction::And) // ...&X&~X = 0
	          return Constant::getNullValue(X->getType());

	        if (Opcode == Instruction::Or) // ...|X|~X = -1
	          return Constant::getAllOnesValue(X->getType());
	      }
	    }

	    // Next, check for duplicate pairs of values, which we assume are next to
	    // each other, due to our sorting criteria.
	    assert(i < Ops.size());
	    if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
	      if (Opcode == Instruction::And || Opcode == Instruction::Or) {
	        // Drop duplicate values for And and Or.
	        Ops.erase(Ops.begin()+i);
	        // Revisit the same index: the loop's ++i undoes this decrement.
	        --i; --e;
	        ++NumAnnihil;
	        continue;
	      }

	      // Drop pairs of values for Xor.
	      assert(Opcode == Instruction::Xor);
	      if (e == 2)
	        return Constant::getNullValue(Ops[0].Op->getType());

	      // Y ^ X^X -> Y
	      Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
	      i -= 1; e -= 2;
	      ++NumAnnihil;
	    }
	  }
	  return nullptr;
	}

	/// Helper function of CombineXorOpnd(). It creates a bitwise-and
	/// instruction with the given two operands, and return the resulting
	/// instruction. There are two special cases: 1) if the constant operand is 0,
	/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
	/// be returned.
	///
	/// A newly created instruction is named "and.ra", inserted before
	/// InsertBefore and given InsertBefore's debug location.
	static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
	                             const APInt &ConstOpnd) {
	  // x & 0 == 0: signalled to the caller by returning null.
	  if (ConstOpnd == 0)
	    return nullptr;

	  // x & ~0 == x: no instruction is needed.
	  if (ConstOpnd.isAllOnesValue())
	    return Opnd;

	  LLVMContext &Ctx = Opnd->getType()->getContext();
	  Instruction *I;
	  I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd),
	                                "and.ra", InsertBefore);
	  I->setDebugLoc(InsertBefore->getDebugLoc());
	  return I;
	}

	// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
	// into "R ^ C", where C would be 0, and R is a symbolic value.
	//
	// If it was successful, true is returned, and the "R" and "C" is returned
	// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
	// and both "Res" and "ConstOpnd" remain unchanged.
	//
	bool ReassociatePass::CombineXorOpnd(Instruction I, XorOpnd Opnd1,
	APInt &ConstOpnd, Value *&Res) {
	// Xor-Rule 1: (x \| c1) ^ c2 = (x \| c1) ^ (c1 ^ c1) ^ c2
	// = ((x \| c1) ^ c1) ^ (c1 ^ c2)
	// = (x & ~c1) ^ (c1 ^ c2)
	// It is useful only when c1 == c2.
	if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) {
	if (!Opnd1->getValue()->hasOneUse())
	return false;

	const APInt &C1 = Opnd1->getConstPart();
	if (C1 != ConstOpnd)
	return false;

	Value *X = Opnd1->getSymbolicPart();
	Res = createAndInstr(I, X, ~C1);
	// ConstOpnd was C2, now C1 ^ C2.
	ConstOpnd ^= C1;

	if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
	RedoInsts.insert(T);
	return true;
	}
	return false;
	}


	// Helper function of OptimizeXor(). It tries to simplify
	// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
	// symbolic value.
	//
	// If it was successful, true is returned, and the "R" and "C" is returned
	// via "Res" and "ConstOpnd", respectively (If the entire expression is
	// evaluated to a constant, the Res is set to NULL); otherwise, false is
	// returned, and both "Res" and "ConstOpnd" remain unchanged.
	bool ReassociatePass::CombineXorOpnd(Instruction I, XorOpnd Opnd1,
	XorOpnd *Opnd2, APInt &ConstOpnd,
	Value *&Res) {
	Value *X = Opnd1->getSymbolicPart();
	if (X != Opnd2->getSymbolicPart())
	return false;

	// This many instruction become dead.(At least "Opnd1 ^ Opnd2" will die.)
	int DeadInstNum = 1;
	if (Opnd1->getValue()->hasOneUse())
	DeadInstNum++;
	if (Opnd2->getValue()->hasOneUse())
	DeadInstNum++;

	// Xor-Rule 2:
	// (x \| c1) ^ (x & c2)
	// = (x\|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x\|c1) ^ c1) ^ (x & c2) ^ c1
	// = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1
	// = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3
	//
	if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
	if (Opnd2->isOrExpr())
	std::swap(Opnd1, Opnd2);

	const APInt &C1 = Opnd1->getConstPart();
	const APInt &C2 = Opnd2->getConstPart();
	APInt C3((~C1) ^ C2);

	// Do not increase code size!
	if (C3 != 0 && !C3.isAllOnesValue()) {
	int NewInstNum = ConstOpnd != 0 ? 1 : 2;
	if (NewInstNum > DeadInstNum)
	return false;
	}

	Res = createAndInstr(I, X, C3);
	ConstOpnd ^= C1;

	} else if (Opnd1->isOrExpr()) {
	// Xor-Rule 3: (x \| c1) ^ (x \| c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
	//
	const APInt &C1 = Opnd1->getConstPart();
	const APInt &C2 = Opnd2->getConstPart();
	APInt C3 = C1 ^ C2;

	// Do not increase code size
	if (C3 != 0 && !C3.isAllOnesValue()) {
	int NewInstNum = ConstOpnd != 0 ? 1 : 2;
	if (NewInstNum > DeadInstNum)
	return false;
	}

	Res = createAndInstr(I, X, C3);
	ConstOpnd ^= C3;
	} else {
	// Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
	//
	const APInt &C1 = Opnd1->getConstPart();
	const APInt &C2 = Opnd2->getConstPart();
	APInt C3 = C1 ^ C2;
	Res = createAndInstr(I, X, C3);
	}

	// Put the original operands in the Redo list; hope they will be deleted
	// as dead code.
	if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
	RedoInsts.insert(T);
	if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
	RedoInsts.insert(T);

	return true;
	}

	/// Optimize a series of operands to an 'xor' instruction.
	///
	/// Constant operands are folded into one running constant. The remaining
	/// operands are wrapped as XorOpnd, grouped by their symbolic value, and
	/// adjacent operands that share a symbolic value are merged through
	/// CombineXorOpnd.
	///
	/// \param I   The xor instruction rooting the expression; forwarded to
	///            CombineXorOpnd.
	/// \param Ops The linearized operand list. It is rebuilt in place whenever
	///            at least one combination succeeded.
	/// \returns The single Value the expression was reduced to, or nullptr when
	///          several operands remain (Ops then holds the updated list).
	Value *ReassociatePass::OptimizeXor(Instruction *I,
	                                    SmallVectorImpl<ValueEntry> &Ops) {
	  if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
	    return V;

	  if (Ops.size() == 1)
	    return nullptr;

	  SmallVector<XorOpnd, 8> Opnds;
	  SmallVector<XorOpnd *, 8> OpndPtrs;
	  Type *Ty = Ops[0].Op->getType();
	  APInt ConstOpnd(Ty->getIntegerBitWidth(), 0);

	  // Step 1: Convert ValueEntry to XorOpnd. Integer constants are not
	  // wrapped; they are xor-ed into ConstOpnd instead.
	  for (const ValueEntry &Entry : Ops) {
	    Value *V = Entry.Op;
	    if (auto *CI = dyn_cast<ConstantInt>(V)) {
	      ConstOpnd ^= CI->getValue();
	      continue;
	    }
	    XorOpnd O(V);
	    O.setSymbolicRank(getRank(O.getSymbolicPart()));
	    Opnds.push_back(O);
	  }

	  // NOTE: From this point on, do NOT add/delete element to/from "Opnds".
	  // It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate
	  // the "OpndPtrs" as well. For the similar reason, do not fuse this loop
	  // with the previous loop --- the iterator of the "Opnds" may be invalidated
	  // when new elements are added to the vector.
	  for (XorOpnd &O : Opnds)
	    OpndPtrs.push_back(&O);

	  // Step 2: Sort the Xor-Operands in a way such that the operands containing
	  // the same symbolic value cluster together. For instance, the input operand
	  // sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
	  // ("x | 123", "x & 789", "y & 456").
	  //
	  // The purpose is twofold:
	  // 1) Cluster together the operands sharing the same symbolic-value.
	  // 2) Operand having smaller symbolic-value-rank is permuted earlier, which
	  //    could potentially shorten critical path, and expose more
	  //    loop-invariants. Note that values' rank are basically defined in RPO
	  //    order (FIXME). So, if Rank(X) < Rank(Y) < Rank(Z), it means X is
	  //    defined earlier than Y which is defined earlier than Z. Permute
	  //    "x | 1", "Y & 2", "z" in the order of X-Y-Z is better than any other
	  //    orders.
	  std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(),
	                   [](XorOpnd *LHS, XorOpnd *RHS) {
	                     return LHS->getSymbolicRank() < RHS->getSymbolicRank();
	                   });

	  // Step 3: Combine adjacent operands.
	  XorOpnd *PrevOpnd = nullptr;
	  bool Changed = false;
	  for (XorOpnd *CurrOpnd : OpndPtrs) {
	    // The combined value, filled in by CombineXorOpnd.
	    Value *CV;

	    // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd".
	    if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
	      Changed = true;
	      if (CV)
	        *CurrOpnd = XorOpnd(CV);
	      else {
	        CurrOpnd->Invalidate();
	        continue;
	      }
	    }

	    if (!PrevOpnd ||
	        CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
	      PrevOpnd = CurrOpnd;
	      continue;
	    }

	    // Step 3.2: When previous and current operands share the same symbolic
	    // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd".
	    if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
	      // Remove previous operand.
	      PrevOpnd->Invalidate();
	      if (CV) {
	        *CurrOpnd = XorOpnd(CV);
	        PrevOpnd = CurrOpnd;
	      } else {
	        CurrOpnd->Invalidate();
	        PrevOpnd = nullptr;
	      }
	      Changed = true;
	    }
	  }

	  // Nothing was combined: leave Ops exactly as it was.
	  if (!Changed)
	    return nullptr;

	  // Step 4: Reassemble the Ops from the surviving operands, followed by the
	  // folded constant when it is non-zero.
	  Ops.clear();
	  for (XorOpnd &O : Opnds) {
	    if (O.isInvalid())
	      continue;
	    ValueEntry VE(getRank(O.getValue()), O.getValue());
	    Ops.push_back(VE);
	  }
	  if (ConstOpnd != 0) {
	    Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd);
	    ValueEntry VE(getRank(C), C);
	    Ops.push_back(VE);
	  }

	  if (Ops.size() == 1)
	    return Ops.back().Op;
	  if (Ops.empty()) {
	    // Every operand cancelled out; the whole expression is the zero constant.
	    assert(ConstOpnd == 0);
	    return ConstantInt::get(Ty->getContext(), ConstOpnd);
	  }

	  return nullptr;
	}

	/// Optimize a series of operands to an 'add' instruction. This
	/// optimizes based on identities. If it can be reduced to a single Value, it
	/// is returned, otherwise the Ops list is mutated as necessary.
	///
	/// Three rewrites are attempted, in this order:
	///  1. Runs of an identical operand are replaced by one multiply
	///     (Y+Y+Y+Z -> 3*Y+Z).
	///  2. X paired with -X is dropped, and X paired with ~X becomes -1.
	///  3. The factor shared by the most multiply operands is pulled out
	///     (A*B + A*C + D -> A*(B+C) + D).
	///
	/// \param I   The add/fadd instruction rooting the expression; new
	///            instructions are created relative to it.
	/// \param Ops The sorted, linearized operand list; updated in place.
	/// \returns The single replacement Value, or nullptr if Ops still holds
	///          more than one operand.
	Value *ReassociatePass::OptimizeAdd(Instruction *I,
	                                    SmallVectorImpl<ValueEntry> &Ops) {
	  // Scan the operand lists looking for X and -X pairs. If we find any, we
	  // can simplify expressions like X+-X == 0 and X+~X ==-1. While we're at it,
	  // scan for any duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.

	  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
	    Value *TheOp = Ops[i].Op;
	    // Check to see if we've seen this operand before. If so, we factor all
	    // instances of the operand together. Due to our sorting criteria, we know
	    // that these need to be next to each other in the vector.
	    if (i + 1 != Ops.size() && Ops[i + 1].Op == TheOp) {
	      // Rescan the list, remove all instances of this operand from the expr.
	      unsigned NumFound = 0;
	      do {
	        Ops.erase(Ops.begin() + i);
	        ++NumFound;
	      } while (i != Ops.size() && Ops[i].Op == TheOp);

	      DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
	      ++NumFactor;

	      // Insert a new multiply.
	      Type *Ty = TheOp->getType();
	      Constant *C = Ty->isIntOrIntVectorTy() ?
	        ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
	      Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);

	      // Now that we have inserted a multiply, optimize it. This allows us to
	      // handle cases that require multiple factoring steps, such as this:
	      // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
	      RedoInsts.insert(Mul);

	      // If every add operand was a duplicate, return the multiply.
	      if (Ops.empty())
	        return Mul;

	      // Otherwise, we had some input that didn't have the dupe, such as
	      // "A + A + B" -> "A*2 + B". Add the new multiply to the list of
	      // things being added by this operation.
	      Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));

	      // Step back so the loop increment revisits position i, and refresh the
	      // cached size since Ops has been edited.
	      --i;
	      e = Ops.size();
	      continue;
	    }

	    // Check for X and -X or X and ~X in the operand list.
	    if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) &&
	        !BinaryOperator::isNot(TheOp))
	      continue;

	    Value *X = nullptr;
	    if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))
	      X = BinaryOperator::getNegArgument(TheOp);
	    else if (BinaryOperator::isNot(TheOp))
	      X = BinaryOperator::getNotArgument(TheOp);

	    // FindInOperandList returning i itself means no partner was found.
	    unsigned FoundX = FindInOperandList(Ops, i, X);
	    if (FoundX == i)
	      continue;

	    // Remove X and -X from the operand list.
	    if (Ops.size() == 2 &&
	        (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)))
	      return Constant::getNullValue(X->getType());

	    // Remove X and ~X from the operand list.
	    if (Ops.size() == 2 && BinaryOperator::isNot(TheOp))
	      return Constant::getAllOnesValue(X->getType());

	    Ops.erase(Ops.begin() + i);
	    if (i < FoundX)
	      --FoundX;
	    else
	      --i; // Need to back up an extra one.
	    Ops.erase(Ops.begin() + FoundX);
	    ++NumAnnihil;
	    --i;    // Revisit element.
	    e -= 2; // Removed two elements.

	    // if X and ~X we append -1 to the operand list.
	    if (BinaryOperator::isNot(TheOp)) {
	      Value *V = Constant::getAllOnesValue(X->getType());
	      Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
	      e += 1;
	    }
	  }

	  // Scan the operand list, checking to see if there are any common factors
	  // between operands. Consider something like A*A+A*B*C+D. We would like to
	  // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
	  // To efficiently find this, we count the number of times a factor occurs
	  // for any ADD operands that are MULs.
	  DenseMap<Value*, unsigned> FactorOccurrences;

	  // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
	  // where they are actually the same multiply.
	  unsigned MaxOcc = 0;
	  Value *MaxOccVal = nullptr;
	  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
	    BinaryOperator *BOp =
	      isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
	    if (!BOp)
	      continue;

	    // Compute all of the factors of this added value.
	    SmallVector<Value*, 8> Factors;
	    FindSingleUseMultiplyFactors(BOp, Factors, Ops);
	    assert(Factors.size() > 1 && "Bad linearize!");

	    // Add one to FactorOccurrences for each unique factor in this op.
	    SmallPtrSet<Value*, 8> Duplicates;
	    for (Value *Factor : Factors) {
	      if (!Duplicates.insert(Factor).second)
	        continue;

	      unsigned Occ = ++FactorOccurrences[Factor];
	      if (Occ > MaxOcc) {
	        MaxOcc = Occ;
	        MaxOccVal = Factor;
	      }

	      // If Factor is a negative constant, add the negated value as a factor
	      // because we can percolate the negate out. Watch for minint, which
	      // cannot be positivified.
	      if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
	        if (CI->isNegative() && !CI->isMinValue(true)) {
	          Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
	- assert(!Duplicates.count(Factor) &&
	- "Shouldn't have two constant factors, missed a canonicalize");
	+ if (!Duplicates.insert(Factor).second)
	+ continue;
	          unsigned Occ = ++FactorOccurrences[Factor];
	          if (Occ > MaxOcc) {
	            MaxOcc = Occ;
	            MaxOccVal = Factor;
	          }
	        }
	      } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
	        if (CF->isNegative()) {
	          APFloat F(CF->getValueAPF());
	          F.changeSign();
	          Factor = ConstantFP::get(CF->getContext(), F);
	- assert(!Duplicates.count(Factor) &&
	- "Shouldn't have two constant factors, missed a canonicalize");
	+ if (!Duplicates.insert(Factor).second)
	+ continue;
	          unsigned Occ = ++FactorOccurrences[Factor];
	          if (Occ > MaxOcc) {
	            MaxOcc = Occ;
	            MaxOccVal = Factor;
	          }
	        }
	      }
	    }
	  }

	  // If any factor occurred more than one time, we can pull it out.
	  if (MaxOcc > 1) {
	    DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
	    ++NumFactor;

	    // Create a new instruction that uses the MaxOccVal twice. If we don't do
	    // this, we could otherwise run into situations where removing a factor
	    // from an expression will drop a use of maxocc, and this can cause
	    // RemoveFactorFromExpression on successive values to behave differently.
	    Instruction *DummyInst =
	        I->getType()->isIntOrIntVectorTy()
	            ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
	            : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);

	    SmallVector<WeakVH, 4> NewMulOps;
	    for (unsigned i = 0; i != Ops.size(); ++i) {
	      // Only try to remove factors from expressions we're allowed to.
	      BinaryOperator *BOp =
	        isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
	      if (!BOp)
	        continue;

	      if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
	        // The factorized operand may occur several times. Convert them all in
	        // one fell swoop. Walking backwards keeps index i valid until its own
	        // entry is erased last.
	        for (unsigned j = Ops.size(); j != i;) {
	          --j;
	          if (Ops[j].Op == Ops[i].Op) {
	            NewMulOps.push_back(V);
	            Ops.erase(Ops.begin() + j);
	          }
	        }
	        --i;
	      }
	    }

	    // No need for extra uses anymore.
	    delete DummyInst;

	    unsigned NumAddedValues = NewMulOps.size();
	    Value *V = EmitAddTreeOfValues(I, NewMulOps);

	    // Now that we have inserted the add tree, optimize it. This allows us to
	    // handle cases that require multiple factoring steps, such as this:
	    // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
	    assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
	    (void)NumAddedValues;
	    if (Instruction *VI = dyn_cast<Instruction>(V))
	      RedoInsts.insert(VI);

	    // Create the multiply.
	    Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I);

	    // Rerun associate on the multiply in case the inner expression turned into
	    // a multiply. We want to make sure that we keep things in canonical form.
	    RedoInsts.insert(V2);

	    // If every add operand included the factor (e.g. "A*B + A*C"), then the
	    // entire result expression is just the multiply "A*(B+C)".
	    if (Ops.empty())
	      return V2;

	    // Otherwise, we had some input that didn't have the factor, such as
	    // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
	    // things being added by this operation.
	    Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
	  }

	  return nullptr;
	}

	/// \brief Split repeated multiplication operands out of \p Ops as
	/// (base, power) pairs.
	///
	/// Equal operands are expected to sit next to each other in \p Ops. For each
	/// run of two or more equal operands, an even number of occurrences is moved
	/// into \p Factors; the result is ordered by descending power.
	///
	/// (x*x) -> [(x, 2)]
	/// ((x*x)*x) -> [(x, 3)]
	/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
	///
	/// \returns Whether any factors have a power greater than one.
	bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
	                                             SmallVectorImpl<Factor> &Factors) {
	  // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
	  // First pass (read-only): total up the lengths of all runs of repeated
	  // operands.
	  unsigned FactorPowerSum = 0;
	  for (unsigned Pos = 1, NumOps = Ops.size(); Pos < NumOps; ++Pos) {
	    Value *Base = Ops[Pos - 1].Op;

	    // Measure the run of operands equal to Base.
	    unsigned RunLength = 1;
	    while (Pos < NumOps && Ops[Pos].Op == Base) {
	      ++RunLength;
	      ++Pos;
	    }
	    // Only runs of two or more can be simplified.
	    if (RunLength > 1)
	      FactorPowerSum += RunLength;
	  }

	  // We can only simplify factors if the sum of the powers of our simplifiable
	  // factors is 4 or higher. When that is the case, we will always have
	  // a simplification. This is an important invariant to prevent cyclicly
	  // trying to simplify already minimal formations.
	  if (FactorPowerSum < 4)
	    return false;

	  // Second pass: actually move the simplifiable factors out of Ops. The size
	  // is re-read every iteration because Ops shrinks as we go.
	  FactorPowerSum = 0;
	  for (unsigned Pos = 1; Pos < Ops.size(); ++Pos) {
	    Value *Base = Ops[Pos - 1].Op;

	    unsigned RunLength = 1;
	    while (Pos < Ops.size() && Ops[Pos].Op == Base) {
	      ++RunLength;
	      ++Pos;
	    }
	    if (RunLength == 1)
	      continue;

	    // Round down to an even count, then rewind to the first element that
	    // will be erased.
	    RunLength &= ~1U;
	    Pos -= RunLength;
	    FactorPowerSum += RunLength;
	    Factors.push_back(Factor(Base, RunLength));
	    Ops.erase(Ops.begin() + Pos, Ops.begin() + Pos + RunLength);
	  }

	  // None of the adjustments above should have reduced the sum of factor powers
	  // below our minimum of '4'.
	  assert(FactorPowerSum >= 4);

	  // Highest power first; stable so equal powers keep their relative order.
	  std::stable_sort(Factors.begin(), Factors.end(),
	                   [](const Factor &A, const Factor &B) {
	                     return A.Power > B.Power;
	                   });
	  return true;
	}

	/// \brief Multiply together every value in \p Ops and return the product.
	///
	/// A single-element list is returned as-is without emitting anything (and
	/// without being popped). Otherwise values are consumed from the back of
	/// \p Ops, which is left empty, and chained through \p Builder using an
	/// integer or floating-point multiply depending on the running product's
	/// type.
	static Value *buildMultiplyTree(IRBuilder<> &Builder,
	                                SmallVectorImpl<Value*> &Ops) {
	  if (Ops.size() == 1)
	    return Ops.back();

	  Value *Product = Ops.pop_back_val();
	  do {
	    Value *Next = Ops.pop_back_val();
	    Product = Product->getType()->isIntOrIntVectorTy()
	                  ? Builder.CreateMul(Product, Next)
	                  : Builder.CreateFMul(Product, Next);
	  } while (!Ops.empty());

	  return Product;
	}

	/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
	///
	/// \p Factors must hold distinct bases with powers sorted in decreasing
	/// order. Bases that share a power are first multiplied into one value; then
	/// every base with an odd power contributes once to the result, all powers
	/// are halved, and the function recurses to obtain the value that has to be
	/// squared. \p Factors is modified along the way.
	///
	/// \returns The value computing the final product.
	Value *
	ReassociatePass::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
	                                         SmallVectorImpl<Factor> &Factors) {
	  assert(Factors[0].Power);
	  SmallVector<Value *, 4> OuterProduct;

	  // Pass 1: fold each group of equal-power factors into the group's first
	  // entry. GroupStart tracks the first index of the current power group.
	  for (unsigned GroupStart = 0, Cur = 1, NumFactors = Factors.size();
	       Cur < NumFactors && Factors[Cur].Power > 0; ++Cur) {
	    if (Factors[Cur].Power != Factors[GroupStart].Power) {
	      GroupStart = Cur;
	      continue;
	    }

	    // We want to multiply across all the factors with the same power so that
	    // we can raise them to that power as a single entity. Build a mini tree
	    // for that.
	    SmallVector<Value *, 4> InnerProduct;
	    InnerProduct.push_back(Factors[GroupStart].Base);
	    do {
	      InnerProduct.push_back(Factors[Cur].Base);
	      ++Cur;
	    } while (Cur < NumFactors &&
	             Factors[Cur].Power == Factors[GroupStart].Power);

	    // Reset the base value of the first factor to the new expression tree.
	    // We'll remove all the factors with the same power in a second pass.
	    Value *Merged = buildMultiplyTree(Builder, InnerProduct);
	    Factors[GroupStart].Base = Merged;
	    if (Instruction *MergedInst = dyn_cast<Instruction>(Merged))
	      RedoInsts.insert(MergedInst);

	    GroupStart = Cur;
	  }

	  // Pass 2: unique factors with equal powers -- we've folded them into the
	  // first one's base.
	  auto SamePower = [](const Factor &A, const Factor &B) {
	    return A.Power == B.Power;
	  };
	  Factors.erase(std::unique(Factors.begin(), Factors.end(), SamePower),
	                Factors.end());

	  // Iteratively collect the base of each factor with an odd power into the
	  // outer product, and halve each power in preparation for squaring the
	  // expression.
	  for (Factor &F : Factors) {
	    if (F.Power & 1)
	      OuterProduct.push_back(F.Base);
	    F.Power >>= 1;
	  }

	  // Anything left with a non-zero power is computed recursively and enters
	  // the product twice, i.e. squared.
	  if (Factors[0].Power) {
	    Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
	    OuterProduct.push_back(SquareRoot);
	    OuterProduct.push_back(SquareRoot);
	  }
	  if (OuterProduct.size() == 1)
	    return OuterProduct.front();

	  return buildMultiplyTree(Builder, OuterProduct);
	}

	/// Optimize a series of operands to a 'mul'/'fmul' instruction by turning
	/// repeated factors into a smaller DAG of multiplies.
	///
	/// \param I   The multiply rooting the expression; new instructions are
	///            built at its position and inherit its fast-math flags.
	/// \param Ops The linearized operand list; repeated factors are removed and
	///            replaced by the value of the rebuilt sub-product.
	/// \returns The replacement Value when no other operand remains, otherwise
	///          nullptr (Ops then holds the updated list).
	Value *ReassociatePass::OptimizeMul(BinaryOperator *I,
	                                    SmallVectorImpl<ValueEntry> &Ops) {
	  // We can only optimize the multiplies when there is a chain of more than
	  // three, such that a balanced tree might require fewer total multiplies.
	  if (Ops.size() < 4)
	    return nullptr;

	  // Try to turn linear trees of multiplies without other uses of the
	  // intermediate stages into minimal multiply DAGs with perfect sub-expression
	  // re-use.
	  SmallVector<Factor, 4> Factors;
	  if (!collectMultiplyFactors(Ops, Factors))
	    return nullptr; // All distinct factors, so nothing left for us to do.

	  IRBuilder<> Builder(I);
	  // The reassociate transformation for FP operations is performed only
	  // if unsafe algebra is permitted by FastMathFlags. Propagate those flags
	  // to the newly generated operations.
	  if (auto *FPI = dyn_cast<FPMathOperator>(I))
	    Builder.setFastMathFlags(FPI->getFastMathFlags());

	  Value *V = buildMinimalMultiplyDAG(Builder, Factors);
	  if (Ops.empty())
	    return V;

	  // Put the new value back into Ops at the position the entry ordering
	  // dictates.
	  ValueEntry NewEntry = ValueEntry(getRank(V), V);
	  Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry);
	  return nullptr;
	}

	/// Optimize the linearized operand list of the expression rooted at \p I.
	///
	/// Trailing constants are folded together first, then the opcode-specific
	/// optimizer (and/or, xor, add, mul) is run. If that changed the number of
	/// operands, the whole procedure is repeated on the new list.
	///
	/// \param I   The root of the expression tree.
	/// \param Ops The sorted operand list; updated in place.
	/// \returns The single Value the expression reduces to, or nullptr when Ops
	///          still has to be written back as a tree.
	Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
	                                           SmallVectorImpl<ValueEntry> &Ops) {
	  // Now that we have the linearized expression tree, try to optimize it.
	  // Start by folding any constants that we found.
	  Constant *Cst = nullptr;
	  unsigned Opcode = I->getOpcode();
	  while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
	    Constant *C = cast<Constant>(Ops.pop_back_val().Op);
	    Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
	  }
	  // If there was nothing but constants then we are done.
	  if (Ops.empty())
	    return Cst;

	  // Put the combined constant back at the end of the operand list, except if
	  // there is no point. For example, an add of 0 gets dropped here, while a
	  // multiplication by zero turns the whole expression into zero.
	  if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
	    if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
	      return Cst;
	    Ops.push_back(ValueEntry(0, Cst));
	  }

	  if (Ops.size() == 1)
	    return Ops[0].Op;

	  // Handle destructive annihilation due to identities between elements in the
	  // argument list here.
	  unsigned NumOps = Ops.size();
	  switch (Opcode) {
	  default:
	    break;
	  case Instruction::And:
	  case Instruction::Or:
	    if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
	      return Result;
	    break;

	  case Instruction::Xor:
	    if (Value *Result = OptimizeXor(I, Ops))
	      return Result;
	    break;

	  case Instruction::Add:
	  case Instruction::FAdd:
	    if (Value *Result = OptimizeAdd(I, Ops))
	      return Result;
	    break;

	  case Instruction::Mul:
	  case Instruction::FMul:
	    if (Value *Result = OptimizeMul(I, Ops))
	      return Result;
	    break;
	  }

	  // A change in operand count means something was simplified; run again on
	  // the new list since further folding may now be possible.
	  if (Ops.size() != NumOps)
	    return OptimizeExpression(I, Ops);
	  return nullptr;
	}

	// Erase the trivially dead instruction I. Any operand that is an instruction
	// and is left without uses is added to Insts so that the caller's worklist
	// will remove it as well.
	void ReassociatePass::RecursivelyEraseDeadInsts(
	    Instruction *I, SetVector<AssertingVH<Instruction>> &Insts) {
	  assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");

	  // Snapshot the operands before I goes away.
	  SmallVector<Value *, 4> Operands(I->op_begin(), I->op_end());

	  // Forget I in all bookkeeping structures, then delete it.
	  ValueRankMap.erase(I);
	  Insts.remove(I);
	  RedoInsts.remove(I);
	  I->eraseFromParent();

	  for (Value *Operand : Operands) {
	    Instruction *OperandInst = dyn_cast<Instruction>(Operand);
	    if (OperandInst && OperandInst->use_empty())
	      Insts.insert(OperandInst);
	  }
	}

	/// Delete the trivially dead instruction \p I and queue its operands for
	/// another optimization attempt.
	///
	/// For every operand that is an instruction, the root of the expression
	/// tree it belongs to is added to RedoInsts rather than the operand itself.
	void ReassociatePass::EraseInst(Instruction *I) {
	  assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
	  DEBUG(dbgs() << "Erasing dead inst: "; I->dump());

	  // Snapshot the operands; they are needed after I has been erased.
	  SmallVector<Value*, 8> Operands(I->op_begin(), I->op_end());

	  // Erase the dead instruction.
	  ValueRankMap.erase(I);
	  RedoInsts.remove(I);
	  I->eraseFromParent();

	  // Optimize its operands.
	  SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
	  for (Value *Operand : Operands) {
	    Instruction *Node = dyn_cast<Instruction>(Operand);
	    if (!Node)
	      continue;

	    // If this is a node in an expression tree, climb to the expression root
	    // and add that since that's where optimization actually happens.
	    const unsigned Opcode = Node->getOpcode();
	    while (Node->hasOneUse() && Node->user_back()->getOpcode() == Opcode &&
	           Visited.insert(Node).second)
	      Node = Node->user_back();
	    RedoInsts.insert(Node);
	  }
	}

	// Canonicalize expressions of the following form:
	// x + (-Constant * y) -> x - (Constant * y)
	// x - (-Constant * y) -> x + (Constant * y)
	//
	// I is the candidate fmul/fdiv carrying the negative floating-point constant.
	// On success the constant in I is made positive, I's single user (an fadd or
	// fsub) is replaced by the opposite operation, and that new instruction is
	// returned. Returns nullptr, leaving everything untouched, when the pattern
	// does not match.
	Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
	  if (!I->hasOneUse() || I->getType()->isVectorTy())
	    return nullptr;

	  // Must be a fmul or fdiv instruction.
	  unsigned Opcode = I->getOpcode();
	  if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv)
	    return nullptr;

	  auto *C0 = dyn_cast<ConstantFP>(I->getOperand(0));
	  auto *C1 = dyn_cast<ConstantFP>(I->getOperand(1));

	  // Both operands are constant, let it get constant folded away.
	  if (C0 && C1)
	    return nullptr;

	  ConstantFP *CF = C0 ? C0 : C1;

	  // Must have one constant operand.
	  if (!CF)
	    return nullptr;

	  // Must be a negative ConstantFP.
	  if (!CF->isNegative())
	    return nullptr;

	  // User must be a binary operator with one or more uses.
	  Instruction *User = I->user_back();
	  if (!isa<BinaryOperator>(User) || !User->hasNUsesOrMore(1))
	    return nullptr;

	  unsigned UserOpcode = User->getOpcode();
	  if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub)
	    return nullptr;

	  // Subtraction is not commutative. Explicitly, the following transform is
	  // not valid: (-Constant * y) - x -> x + (Constant * y)
	  if (!User->isCommutative() && User->getOperand(1) != I)
	    return nullptr;

	  // Change the sign of the constant.
	  APFloat Val = CF->getValueAPF();
	  Val.changeSign();
	  I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val));

	  // Canonicalize I to RHS to simplify the next bit of logic. E.g.,
	  // ((-Const*y) + x) -> (x + (-Const*y)).
	  if (User->getOperand(0) == I && User->isCommutative())
	    cast<BinaryOperator>(User)->swapOperands();

	  // Build the opposite operation on the same operands, carrying over the
	  // user's fast-math flags.
	  Value *Op0 = User->getOperand(0);
	  Value *Op1 = User->getOperand(1);
	  BinaryOperator *NI;
	  switch (UserOpcode) {
	  default:
	    llvm_unreachable("Unexpected Opcode!");
	  case Instruction::FAdd:
	    NI = BinaryOperator::CreateFSub(Op0, Op1);
	    NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
	    break;
	  case Instruction::FSub:
	    NI = BinaryOperator::CreateFAdd(Op0, Op1);
	    NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
	    break;
	  }

	  NI->insertBefore(User);
	  NI->setName(User->getName());
	  User->replaceAllUsesWith(NI);
	  NI->setDebugLoc(I->getDebugLoc());
	  RedoInsts.insert(I);
	  MadeChange = true;
	  return NI;
	}

	/// Inspect and optimize the given instruction. Note that erasing
	/// instructions is not allowed.
	///
	/// The instruction is first brought into a form the reassociation machinery
	/// understands (shift -> multiply, negative FP constants hoisted, operands
	/// canonicalized, subtract -> add of negation, negate -> multiply by -1).
	/// If the result is the root of an associative expression tree, it is handed
	/// to ReassociateExpression. Instructions that were replaced are queued in
	/// RedoInsts instead of being deleted here.
	void ReassociatePass::OptimizeInst(Instruction *I) {
	  // Only consider operations that we understand.
	  if (!isa<BinaryOperator>(I))
	    return;

	  if (I->getOpcode() == Instruction::Shl &&
	      isa<ConstantInt>(I->getOperand(1))) {
	    // If an operand of this shift is a reassociable multiply, or if the shift
	    // is used by a reassociable multiply or add, turn into a multiply.
	    if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
	        (I->hasOneUse() &&
	         (isReassociableOp(I->user_back(), Instruction::Mul) ||
	          isReassociableOp(I->user_back(), Instruction::Add)))) {
	      Instruction *NI = ConvertShiftToMul(I);
	      RedoInsts.insert(I);
	      MadeChange = true;
	      I = NI;
	    }
	  }

	  // Canonicalize negative constants out of expressions.
	  if (Instruction *Res = canonicalizeNegConstExpr(I))
	    I = Res;

	  // Commute binary operators, to canonicalize the order of their operands.
	  // This can potentially expose more CSE opportunities, and makes writing other
	  // transformations simpler.
	  if (I->isCommutative())
	    canonicalizeOperands(I);

	  // TODO: We should optimize vector Xor instructions, but they are
	  // currently unsupported.
	  if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor)
	    return;

	  // Don't optimize floating point instructions that don't have unsafe algebra.
	  if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())
	    return;

	  // Do not reassociate boolean (i1) expressions. We want to preserve the
	  // original order of evaluation for short-circuited comparisons that
	  // SimplifyCFG has folded to AND/OR expressions. If the expression
	  // is not further optimized, it is likely to be transformed back to a
	  // short-circuited form for code gen, and the source order may have been
	  // optimized for the most likely conditions.
	  if (I->getType()->isIntegerTy(1))
	    return;

	  // If this is a subtract instruction which is not already in negate form,
	  // see if we can convert it to X+-Y.
	  if (I->getOpcode() == Instruction::Sub) {
	    if (ShouldBreakUpSubtract(I)) {
	      Instruction *NI = BreakUpSubtract(I, RedoInsts);
	      RedoInsts.insert(I);
	      MadeChange = true;
	      I = NI;
	    } else if (BinaryOperator::isNeg(I)) {
	      // Otherwise, this is a negation. See if the operand is a multiply tree
	      // and if this is not an inner node of a multiply tree.
	      if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
	          (!I->hasOneUse() ||
	           !isReassociableOp(I->user_back(), Instruction::Mul))) {
	        Instruction *NI = LowerNegateToMultiply(I);
	        // If the negate was simplified, revisit the users to see if we can
	        // reassociate further.
	        for (User *U : NI->users()) {
	          if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
	            RedoInsts.insert(Tmp);
	        }
	        RedoInsts.insert(I);
	        MadeChange = true;
	        I = NI;
	      }
	    }
	  } else if (I->getOpcode() == Instruction::FSub) {
	    if (ShouldBreakUpSubtract(I)) {
	      Instruction *NI = BreakUpSubtract(I, RedoInsts);
	      RedoInsts.insert(I);
	      MadeChange = true;
	      I = NI;
	    } else if (BinaryOperator::isFNeg(I)) {
	      // Otherwise, this is a negation. See if the operand is a multiply tree
	      // and if this is not an inner node of a multiply tree.
	      if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
	          (!I->hasOneUse() ||
	           !isReassociableOp(I->user_back(), Instruction::FMul))) {
	        // If the negate was simplified, revisit the users to see if we can
	        // reassociate further.
	        Instruction *NI = LowerNegateToMultiply(I);
	        for (User *U : NI->users()) {
	          if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
	            RedoInsts.insert(Tmp);
	        }
	        RedoInsts.insert(I);
	        MadeChange = true;
	        I = NI;
	      }
	    }
	  }

	  // If this instruction is an associative binary operator, process it.
	  if (!I->isAssociative())
	    return;
	  BinaryOperator *BO = cast<BinaryOperator>(I);

	  // If this is an interior node of a reassociable tree, ignore it until we
	  // get to the root of the tree, to avoid N^2 analysis.
	  unsigned Opcode = BO->getOpcode();
	  if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
	    // During the initial run we will get to the root of the tree.
	    // But if we get here while we are redoing instructions, there is no
	    // guarantee that the root will be visited. So Redo later
	    if (BO->user_back() != BO &&
	        BO->getParent() == BO->user_back()->getParent())
	      RedoInsts.insert(BO->user_back());
	    return;
	  }

	  // If this is an add tree that is used by a sub instruction, ignore it
	  // until we process the subtract.
	  if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
	      cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
	    return;
	  if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
	      cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
	    return;

	  ReassociateExpression(BO);
	}

	/// Reassociate the expression tree rooted at \p I.
	///
	/// The tree is flattened into a rank-sorted operand list, the list is
	/// optimized, and the outcome is either substituted for \p I directly (when
	/// a single value remains) or written back as a new tree.
	void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
	  // First, walk the expression tree, linearizing the tree, collecting the
	  // operand information.
	  SmallVector<RepeatedValue, 8> Tree;
	  MadeChange |= LinearizeExprTree(I, Tree);

	  // Expand every (value, repeat count) pair into that many operand entries.
	  SmallVector<ValueEntry, 8> Ops;
	  Ops.reserve(Tree.size());
	  for (const RepeatedValue &Leaf : Tree)
	    Ops.append(Leaf.second.getZExtValue(),
	               ValueEntry(getRank(Leaf.first), Leaf.first));

	  DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');

	  // Now that we have linearized the tree to a list and have gathered all of
	  // the operands and their ranks, sort the operands by their rank. Use a
	  // stable_sort so that values with equal ranks will have their relative
	  // positions maintained (and so the compiler is deterministic). Note that
	  // this sorts so that the highest ranking values end up at the beginning of
	  // the vector.
	  std::stable_sort(Ops.begin(), Ops.end());

	  // Now that we have the expression tree in a convenient
	  // sorted form, optimize it globally if possible.
	  if (Value *Simplified = OptimizeExpression(I, Ops)) {
	    // Self-referential expression in unreachable code.
	    if (Simplified == I)
	      return;

	    // This expression tree simplified to something that isn't a tree,
	    // eliminate it.
	    DEBUG(dbgs() << "Reassoc to scalar: " << *Simplified << '\n');
	    I->replaceAllUsesWith(Simplified);
	    if (Instruction *SimplifiedInst = dyn_cast<Instruction>(Simplified))
	      SimplifiedInst->setDebugLoc(I->getDebugLoc());
	    RedoInsts.insert(I);
	    ++NumAnnihil;
	    return;
	  }

	  // We want to sink immediates as deeply as possible except in the case where
	  // this is a multiply tree used only by an add, and the immediate is a -1.
	  // In this case we reassociate to put the negation on the outside so that we
	  // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
	  if (I->hasOneUse() &&
	      ((I->getOpcode() == Instruction::Mul &&
	        cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
	        isa<ConstantInt>(Ops.back().Op) &&
	        cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) ||
	       (I->getOpcode() == Instruction::FMul &&
	        cast<Instruction>(I->user_back())->getOpcode() == Instruction::FAdd &&
	        isa<ConstantFP>(Ops.back().Op) &&
	        cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)))) {
	    // Move the trailing -1 to the front of the list.
	    ValueEntry MinusOne = Ops.pop_back_val();
	    Ops.insert(Ops.begin(), MinusOne);
	  }

	  DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');

	  if (Ops.size() != 1) {
	    // Now that we ordered and optimized the expressions, splat them back into
	    // the expression tree, removing any unneeded nodes.
	    RewriteExprTree(I, Ops);
	    return;
	  }

	  Value *Only = Ops[0].Op;
	  // Self-referential expression in unreachable code.
	  if (Only == I)
	    return;

	  // This expression tree simplified to something that isn't a tree,
	  // eliminate it.
	  I->replaceAllUsesWith(Only);
	  if (Instruction *OnlyInst = dyn_cast<Instruction>(Only))
	    OnlyInst->setDebugLoc(I->getDebugLoc());
	  RedoInsts.insert(I);
	}

	/// Run reassociation over \p F.
	///
	/// Blocks are visited in reverse post-order. After each block, the
	/// instructions queued in RedoInsts are first pruned of dead code and then
	/// re-optimized until the queue is empty.
	///
	/// \returns PreservedAnalyses::all() when nothing changed; otherwise a set
	///          that preserves only GlobalsAA.
	PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
	  // Get the functions basic blocks in Reverse Post Order. This order is used by
	  // BuildRankMap to pre calculate ranks correctly. It also excludes dead basic
	  // blocks (it has been seen that the analysis in this pass could hang when
	  // analysing dead basic blocks).
	  ReversePostOrderTraversal<Function *> RPOT(&F);

	  // Calculate the rank map for F.
	  BuildRankMap(F, RPOT);

	  MadeChange = false;
	  // Traverse the same blocks that was analysed by BuildRankMap.
	  for (BasicBlock *BB : RPOT) {
	    assert(RankMap.count(BB) && "BB should be ranked.");

	    // Optimize every instruction in the basic block. When erasing, the
	    // iterator is advanced before the instruction it pointed at is deleted.
	    BasicBlock::iterator It = BB->begin();
	    const BasicBlock::iterator End = BB->end();
	    while (It != End) {
	      if (isInstructionTriviallyDead(&*It)) {
	        EraseInst(&*It++);
	        continue;
	      }
	      OptimizeInst(&*It);
	      assert(It->getParent() == BB && "Moved to a different block!");
	      ++It;
	    }

	    // Make a copy of all the instructions to be redone so we can remove dead
	    // instructions.
	    SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
	    // Iterate over all instructions to be reevaluated and remove trivially dead
	    // instructions. If any operand of the trivially dead instruction becomes
	    // dead mark it for deletion as well. Continue this process until all
	    // trivially dead instructions have been removed.
	    while (!ToRedo.empty()) {
	      Instruction *Candidate = ToRedo.pop_back_val();
	      if (!isInstructionTriviallyDead(Candidate))
	        continue;
	      RecursivelyEraseDeadInsts(Candidate, ToRedo);
	      MadeChange = true;
	    }

	    // Now that we have removed dead instructions, we can reoptimize the
	    // remaining instructions.
	    while (!RedoInsts.empty()) {
	      Instruction *Pending = RedoInsts.pop_back_val();
	      if (isInstructionTriviallyDead(Pending))
	        EraseInst(Pending);
	      else
	        OptimizeInst(Pending);
	    }
	  }

	  // We are done with the rank map.
	  RankMap.clear();
	  ValueRankMap.clear();

	  if (!MadeChange)
	    return PreservedAnalyses::all();

	  // FIXME: This should also 'preserve the CFG'.
	  PreservedAnalyses PA;
	  PA.preserve<GlobalsAA>();
	  return PA;
	}

	namespace {
	/// Legacy pass-manager wrapper that forwards every function to an embedded
	/// ReassociatePass.
	class ReassociateLegacyPass : public FunctionPass {
	  ReassociatePass Impl;

	public:
	  static char ID; // Pass identification, replacement for typeid

	  ReassociateLegacyPass() : FunctionPass(ID) {
	    initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
	  }

	  /// Runs the wrapped pass on \p F unless skipFunction() asks to skip it.
	  /// \returns true when the wrapped pass reports that not every analysis was
	  /// preserved.
	  bool runOnFunction(Function &F) override {
	    if (skipFunction(F))
	      return false;

	    // ReassociatePass::run leaves its analysis-manager parameter unnamed, so
	    // a default-constructed manager is enough here.
	    FunctionAnalysisManager DummyFAM;
	    const bool EverythingPreserved = Impl.run(F, DummyFAM).areAllPreserved();
	    return !EverythingPreserved;
	  }

	  /// Declares that the CFG and the GlobalsAA wrapper analysis are preserved.
	  void getAnalysisUsage(AnalysisUsage &AU) const override {
	    AU.setPreservesCFG();
	    AU.addPreserved<GlobalsAAWrapperPass>();
	  }
	};
	} // end anonymous namespace

	// Definition of the static pass-identification member declared in
	// ReassociateLegacyPass.
	char ReassociateLegacyPass::ID = 0;
	// Registers the legacy pass under the argument name "reassociate".
	// NOTE(review): the two trailing `false` flags are presumably the
	// CFG-only / is-analysis flags of INITIALIZE_PASS -- confirm against the
	// macro definition.
	INITIALIZE_PASS(ReassociateLegacyPass, "reassociate",
	"Reassociate expressions", false, false)

	/// Public interface to the Reassociate pass.
	///
	/// \returns a ReassociateLegacyPass allocated with `new`; the raw pointer is
	/// handed to the caller.
	FunctionPass *llvm::createReassociatePass() {
	return new ReassociateLegacyPass();
	}
	Index: projects/clang400-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 314268)
	+++ projects/clang400-import/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (revision 314269)
	@@ -1,4945 +1,4854 @@
	//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
	// stores that can be put together into vector-stores. Next, it attempts to
	// construct vectorizable tree using the use-def chains. If a profitable tree
	// was found, the SLP vectorizer performs vectorization on the tree.
	//
	// The pass is inspired by the work described in the paper:
	// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
	//
	//===----------------------------------------------------------------------===//
	#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/PostOrderIterator.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/CodeMetrics.h"
	#include "llvm/Analysis/GlobalsModRef.h"
	#include "llvm/Analysis/LoopAccessAnalysis.h"
	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/NoFolder.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Value.h"
	#include "llvm/IR/Verifier.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/Vectorize.h"
	#include <algorithm>
	#include <memory>

	using namespace llvm;
	using namespace slpvectorizer;

	#define SV_NAME "slp-vectorizer"
	#define DEBUG_TYPE "SLP"

	// Statistic counter for the vector instructions this pass generates.
	STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

	/// Hidden option "slp-threshold" (default 0).
	static cl::opt<int>
	SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
	cl::desc("Only vectorize if you gain more than this "
	"number "));

	/// Hidden option "slp-vectorize-hor" (default true).
	static cl::opt<bool>
	ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
	cl::desc("Attempt to vectorize horizontal reductions"));

	/// Hidden option "slp-vectorize-hor-store" (default false).
	static cl::opt<bool> ShouldStartVectorizeHorAtStore(
	"slp-vectorize-hor-store", cl::init(false), cl::Hidden,
	cl::desc(
	"Attempt to vectorize horizontal reductions feeding into a store"));

	/// Hidden option "slp-max-reg-size" (default 128). BoUpSLP only uses it when
	/// it was actually given on the command line; otherwise the register width
	/// reported by TTI is used.
	static cl::opt<int>
	MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
	cl::desc("Attempt to vectorize for this register size in bits"));

	/// Limits the size of scheduling regions in a block.
	/// It avoids long compile times for _very_ large blocks where vector
	/// instructions are spread over a wide range.
	/// This limit is way higher than needed by real-world functions.
	static cl::opt<int>
	ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
	cl::desc("Limit the size of the SLP scheduling region per block"));

	/// Hidden option "slp-min-reg-size" (default 128); copied unconditionally
	/// into BoUpSLP::MinVecRegSize.
	/// NOTE(review): the description string is identical to the one used for
	/// "slp-max-reg-size" -- confirm whether that is intended.
	static cl::opt<int> MinVectorRegSizeOption(
	"slp-min-reg-size", cl::init(128), cl::Hidden,
	cl::desc("Attempt to vectorize for this register size in bits"));

	/// Hidden option "slp-recursion-max-depth" (default 12).
	static cl::opt<unsigned> RecursionMaxDepth(
	"slp-recursion-max-depth", cl::init(12), cl::Hidden,
	cl::desc("Limit the recursion depth when building a vectorizable tree"));

	/// Hidden option "slp-min-tree-size" (default 3).
	static cl::opt<unsigned> MinTreeSize(
	"slp-min-tree-size", cl::init(3), cl::Hidden,
	cl::desc("Only vectorize small trees if they are fully vectorizable"));

	// Limit the number of alias checks. The limit is chosen so that
	// it has no negative effect on the llvm benchmarks.
	static const unsigned AliasedCheckLimit = 10;

	// Another limit for the alias checks: The maximum distance between load/store
	// instructions where alias checks are done.
	// This limit is useful for very large basic blocks.
	static const unsigned MaxMemDepDistance = 160;

	/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
	/// regions to be handled. BlockScheduling::clear() uses this value as the
	/// floor for the remaining per-block budget.
	static const int MinScheduleRegionSize = 16;

	/// \brief Tells whether \p Ty may be used as an element type by the SLP
	/// vectorizer.
	///
	/// A type qualifies when LLVM accepts it as a vector element type and it is
	/// neither x86_fp80 nor ppc_fp128. Those two target-specific types have no
	/// meaningful vectorization path, so rejecting them here avoids consulting
	/// the cost model only to learn that they would be scalarized anyway.
	static bool isValidElementType(Type *Ty) {
	  if (!VectorType::isValidElementType(Ty))
	    return false;
	  if (Ty->isX86_FP80Ty())
	    return false;
	  return !Ty->isPPC_FP128Ty();
	}

	/// \returns true when every value in \p VL is an Instruction and all of them
	/// have the same parent basic block; false otherwise. \p VL must not be
	/// empty (element 0 is read unconditionally).
	static bool allSameBlock(ArrayRef<Value *> VL) {
	  auto *First = dyn_cast<Instruction>(VL[0]);
	  if (!First)
	    return false;

	  BasicBlock *Parent = First->getParent();
	  // Re-checking element 0 is harmless and keeps the loop uniform.
	  for (Value *V : VL) {
	    auto *Inst = dyn_cast<Instruction>(V);
	    if (!Inst)
	      return false;
	    if (Inst->getParent() != Parent)
	      return false;
	  }
	  return true;
	}

	/// \returns true when no element of \p VL fails the isa<Constant> check
	/// (which is trivially the case for an empty list).
	static bool allConstant(ArrayRef<Value *> VL) {
	  for (unsigned Idx = 0, End = VL.size(); Idx != End; ++Idx) {
	    if (isa<Constant>(VL[Idx]))
	      continue;
	    return false;
	  }
	  return true;
	}

	/// \returns true when every element of \p VL is the same Value pointer as
	/// the first one. Lists with fewer than two elements are splats by
	/// definition.
	static bool isSplat(ArrayRef<Value *> VL) {
	  // front() is only evaluated inside the loop, i.e. never for an empty list.
	  for (Value *V : VL)
	    if (V != VL.front())
	      return false;
	  return true;
	}

	/// \returns the opcode that can be paired with \p Op to form an alternating
	/// sequence which can later be merged as a ShuffleVector instruction:
	/// FAdd <-> FSub and Add <-> Sub. Returns 0 for every other opcode.
	static unsigned getAltOpcode(unsigned Op) {
	  if (Op == Instruction::FAdd)
	    return Instruction::FSub;
	  if (Op == Instruction::FSub)
	    return Instruction::FAdd;
	  if (Op == Instruction::Add)
	    return Instruction::Sub;
	  if (Op == Instruction::Sub)
	    return Instruction::Add;
	  // No alternate opcode is defined for anything else.
	  return 0;
	}

	/// \returns true if opcode \p Op can be part of an alternating sequence
	/// which can later be merged as a ShuffleVector instruction, i.e. if it is
	/// one of FAdd, FSub, Add or Sub (the opcodes getAltOpcode() knows about).
	static bool canCombineAsAltInst(unsigned Op) {
	  switch (Op) {
	  case Instruction::FAdd:
	  case Instruction::FSub:
	  case Instruction::Add:
	  case Instruction::Sub:
	    return true;
	  default:
	    return false;
	  }
	}

	/// \returns Instruction::ShuffleVector if the instructions in \p VL form an
	/// alternating opcode sequence (fadd,fsub / fsub,fadd / add,sub / sub,add;
	/// e.g. fadd,fsub,fadd,fsub,...), and 0 otherwise.
	///
	/// Even lanes must carry the opcode of VL[0] and odd lanes its alternate
	/// opcode as given by getAltOpcode().
	static unsigned isAltInst(ArrayRef<Value *> VL) {
	  auto *I0 = dyn_cast<Instruction>(VL[0]);
	  // A non-instruction in lane 0 can never start an alternating sequence;
	  // bail out instead of dereferencing a null dyn_cast result.
	  if (!I0)
	    return 0;

	  const unsigned Opcode = I0->getOpcode();
	  const unsigned AltOpcode = getAltOpcode(Opcode);
	  for (int i = 1, e = VL.size(); i < e; i++) {
	    auto *I = dyn_cast<Instruction>(VL[i]);
	    if (!I)
	      return 0;
	    const unsigned Expected = (i & 1) ? AltOpcode : Opcode;
	    if (I->getOpcode() != Expected)
	      return 0;
	  }
	  return Instruction::ShuffleVector;
	}

	/// \returns the common opcode if all values in \p VL are Instructions with
	/// the same opcode, or zero otherwise.
	///
	/// Special case: if the very first mismatch is in lane 1 and the opcode of
	/// lane 0 can take part in an alternating sequence, the result of
	/// isAltInst() is returned instead (Instruction::ShuffleVector or 0).
	static unsigned getSameOpcode(ArrayRef<Value *> VL) {
	  auto *I0 = dyn_cast<Instruction>(VL[0]);
	  if (!I0)
	    return 0;

	  const unsigned Opcode = I0->getOpcode();
	  for (int i = 1, e = VL.size(); i < e; i++) {
	    auto *I = dyn_cast<Instruction>(VL[i]);
	    if (I && I->getOpcode() == Opcode)
	      continue;

	    // First lane that is not an instruction with the expected opcode.
	    if (i == 1 && canCombineAsAltInst(Opcode))
	      return isAltInst(VL);
	    return 0;
	  }
	  return Opcode;
	}

	/// Get the intersection (logical and) of all of the potential IR flags
	/// of each scalar operation (VL) that will be converted into a vector (I).
	/// Flag set: NSW, NUW, exact, and all of fast-math.
	static void propagateIRFlags(Value I, ArrayRef<Value > VL) {
	if (auto *VecOp = dyn_cast<Instruction>(I)) {
	if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
	// Intersection is initialized to the 0th scalar,
	// so start counting from index '1'.
	for (int i = 1, e = VL.size(); i < e; ++i) {
	if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
	Intersection->andIRFlags(Scalar);
	}
	VecOp->copyIRFlags(Intersection);
	}
	}
	}

	/// \returns true when every value in \p VL has the same type as VL[0], and
	/// false otherwise. \p VL must not be empty (element 0 is read
	/// unconditionally).
	static bool allSameType(ArrayRef<Value *> VL) {
	  Type *const Expected = VL[0]->getType();
	  for (Value *V : VL) {
	    if (V->getType() != Expected)
	      return false;
	  }
	  return true;
	}

	/// \returns true if the Extract{Value,Element} instruction \p E extracts
	/// element \p Idx.
	///
	/// \p Opcode selects the interpretation and must be ExtractElement or
	/// ExtractValue. For ExtractElement the index operand has to be a
	/// ConstantInt equal to \p Idx; for ExtractValue there has to be exactly one
	/// index and it has to equal \p Idx.
	static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
	  assert(Opcode == Instruction::ExtractElement ||
	         Opcode == Instruction::ExtractValue);
	  if (Opcode == Instruction::ExtractElement) {
	    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
	    return CI && CI->getZExtValue() == Idx;
	  }

	  auto *EI = cast<ExtractValueInst>(E);
	  return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
	}

	/// \returns True if in-tree use also needs extract. This refers to
	/// possible scalar operand in vectorized instruction.
	static bool InTreeUserNeedToExtract(Value Scalar, Instruction UserInst,
	TargetLibraryInfo *TLI) {

	unsigned Opcode = UserInst->getOpcode();
	switch (Opcode) {
	case Instruction::Load: {
	LoadInst *LI = cast<LoadInst>(UserInst);
	return (LI->getPointerOperand() == Scalar);
	}
	case Instruction::Store: {
	StoreInst *SI = cast<StoreInst>(UserInst);
	return (SI->getPointerOperand() == Scalar);
	}
	case Instruction::Call: {
	CallInst *CI = cast<CallInst>(UserInst);
	Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
	if (hasVectorInstrinsicScalarOpd(ID, 1)) {
	return (CI->getArgOperand(1) == Scalar);
	}
	}
	default:
	return false;
	}
	}

	/// \returns the memory location accessed by \p I when it is a store or a
	/// load, and a default-constructed MemoryLocation for anything else.
	/// \p AA is accepted for the callers' convenience but is not used.
	static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
	  if (auto *SI = dyn_cast<StoreInst>(I))
	    return MemoryLocation::get(SI);
	  if (auto *LI = dyn_cast<LoadInst>(I))
	    return MemoryLocation::get(LI);
	  return MemoryLocation();
	}

	/// \returns the isSimple() answer of \p I when it is a load or a store,
	/// "not volatile" when it is a memory intrinsic, and true for every other
	/// instruction.
	static bool isSimple(Instruction *I) {
	  if (auto *Load = dyn_cast<LoadInst>(I))
	    return Load->isSimple();
	  if (auto *Store = dyn_cast<StoreInst>(I))
	    return Store->isSimple();
	  auto *MemOp = dyn_cast<MemIntrinsic>(I);
	  return MemOp ? !MemOp->isVolatile() : true;
	}

	namespace llvm {
	namespace slpvectorizer {
	/// Bottom Up SLP Vectorizer.
	class BoUpSLP {
	public:
	typedef SmallVector<Value *, 8> ValueList;
	typedef SmallVector<Instruction *, 16> InstrList;
	typedef SmallPtrSet<Value *, 16> ValueSet;
	typedef SmallVector<StoreInst *, 8> StoreList;

	/// Sets up the vectorizer for \p Func. All analyses are stored as the raw
	/// pointers that were passed in; the IR builder is created on the context
	/// obtained from \p Se. Ephemeral values of the function are collected
	/// into EphValues right away.
	BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
	        TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
	        DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
	        const DataLayout *DL)
	    : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
	      SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
	      DL(DL), Builder(Se->getContext()) {
	  CodeMetrics::collectEphemeralValues(F, AC, EphValues);
	  // Use the vector register size specified by the target unless overridden
	  // by a command-line option.
	  // TODO: It would be better to limit the vectorization factor based on
	  //       data type rather than just register size. For example, x86 AVX has
	  //       256-bit registers, but it does not support integer operations
	  //       at that width (that requires AVX2).
	  if (MaxVectorRegSizeOption.getNumOccurrences())
	    MaxVecRegSize = MaxVectorRegSizeOption;
	  else
	    MaxVecRegSize = TTI->getRegisterBitWidth(true);

	  // The minimum always comes from the command-line option (or its default).
	  MinVecRegSize = MinVectorRegSizeOption;
	}

	/// \brief Vectorize the tree that starts with the elements in \p VL.
	/// Returns the vectorized root.
	Value *vectorizeTree();

	/// \returns the cost incurred by unwanted spills and fills, caused by
	/// holding live values over call sites.
	int getSpillCost();

	/// \returns the vectorization cost of the subtree that starts at \p VL.
	/// A negative number means that this is profitable.
	int getTreeCost();

	/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
	/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
	void buildTree(ArrayRef<Value *> Roots,
	ArrayRef<Value *> UserIgnoreLst = None);

	/// Clear the internal data structures that are created by 'buildTree'.
	void deleteTree() {
	VectorizableTree.clear();
	ScalarToTreeEntry.clear();
	MustGather.clear();
	ExternalUses.clear();
	NumLoadsWantToKeepOrder = 0;
	NumLoadsWantToChangeOrder = 0;
	for (auto &Iter : BlocksSchedules) {
	BlockScheduling *BS = Iter.second.get();
	BS->clear();
	}
	MinBWs.clear();
	}

	/// \brief Perform LICM and CSE on the newly generated gather sequences.
	void optimizeGatherSequence();

	/// \returns true if it is beneficial to reverse the vector order, i.e. when
	/// more load bundles were counted as wanting the changed order
	/// (NumLoadsWantToChangeOrder) than as wanting the current one
	/// (NumLoadsWantToKeepOrder).
	bool shouldReorder() const {
	return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
	}

	/// \return The vector element size in bits to use when vectorizing the
	/// expression tree ending at \p V. If V is a store, the size is the width of
	/// the stored value. Otherwise, the size is the width of the largest loaded
	/// value reaching V. This method is used by the vectorizer to calculate
	/// vectorization factors.
	unsigned getVectorElementSize(Value *V);

	/// Compute the minimum type sizes required to represent the entries in a
	/// vectorizable tree.
	void computeMinimumValueSizes();

	/// \returns the maximum vector register size as set by TTI or overridden by
	/// cl::opt (the value stored in MaxVecRegSize by the constructor).
	unsigned getMaxVecRegSize() const {
	return MaxVecRegSize;
	}

	/// \returns the minimum vector register size as set by cl::opt (the value
	/// stored in MinVecRegSize by the constructor).
	unsigned getMinVecRegSize() const {
	return MinVecRegSize;
	}

	/// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
	///
	/// \returns number of elements in vector if isomorphism exists, 0 otherwise.
	unsigned canMapToVector(Type *T, const DataLayout &DL) const;

	/// \returns True if the VectorizableTree is both tiny and not fully
	/// vectorizable. We do not vectorize such trees.
	bool isTreeTinyAndNotFullyVectorizable();

	private:
	struct TreeEntry;

	/// \returns the cost of the vectorizable entry.
	int getEntryCost(TreeEntry *E);

	/// This is the recursive part of buildTree.
	void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);

	/// \returns True if the ExtractElement/ExtractValue instructions in VL can
	/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
	bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;

	/// Vectorize a single entry in the tree.
	Value *vectorizeTree(TreeEntry *E);

	/// Vectorize a single entry in the tree, starting in \p VL.
	Value *vectorizeTree(ArrayRef<Value *> VL);

	/// \returns the pointer to the vectorized value if \p VL is already
	/// vectorized, or NULL. This may happen in cycles.
	Value *alreadyVectorized(ArrayRef<Value *> VL) const;

	/// \returns the scalarization cost for this type. Scalarization in this
	/// context means the creation of vectors from a group of scalars.
	int getGatherCost(Type *Ty);

	/// \returns the scalarization cost for this list of values. Assuming that
	/// this subtree gets vectorized, we may need to extract the values from the
	/// roots. This method calculates the cost of extracting the values.
	int getGatherCost(ArrayRef<Value *> VL);

	/// \brief Set the Builder insert point to one after the last instruction in
	/// the bundle
	void setInsertPointAfterBundle(ArrayRef<Value *> VL);

	/// \returns a vector of type \p Ty built from the collection of scalars in
	/// \p VL.
	Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

	/// \returns whether the VectorizableTree is fully vectorizable and will
	/// be beneficial even the tree height is tiny.
	bool isFullyVectorizableTinyTree();

	/// \reorder commutative operands in alt shuffle if they result in
	/// vectorized code.
	void reorderAltShuffleOperands(ArrayRef<Value *> VL,
	SmallVectorImpl<Value *> &Left,
	SmallVectorImpl<Value *> &Right);
	/// \reorder commutative operands to get better probability of
	/// generating vectorized code.
	void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
	SmallVectorImpl<Value *> &Left,
	SmallVectorImpl<Value *> &Right);
	/// One node of the vectorizable tree: a bundle of scalars together with the
	/// vector value they were turned into (once available).
	struct TreeEntry {
	  TreeEntry() : Scalars(), VectorizedValue(nullptr), NeedToGather(false) {}

	  /// \returns true if \p VL holds exactly the scalars of this entry, in the
	  /// same order. \p VL must have as many elements as the entry.
	  bool isSame(ArrayRef<Value *> VL) const {
	    assert(VL.size() == Scalars.size() && "Invalid size");
	    for (unsigned Lane = 0, End = VL.size(); Lane != End; ++Lane)
	      if (VL[Lane] != Scalars[Lane])
	        return false;
	    return true;
	  }

	  /// The scalars that make up this entry.
	  ValueList Scalars;

	  /// The value the Scalars were vectorized into; null until then.
	  Value *VectorizedValue;

	  /// True if this sequence has to be gathered.
	  bool NeedToGather;
	};

	/// Appends a new entry holding the scalars of \p VL to VectorizableTree.
	///
	/// If \p Vectorized is true, every scalar is mapped to the index of the new
	/// entry in ScalarToTreeEntry (a scalar must not be in the tree already);
	/// otherwise the scalars are recorded in MustGather and the entry is marked
	/// as NeedToGather.
	///
	/// \returns a pointer to the new entry. It points into the std::vector
	/// VectorizableTree, so it is only valid until the next entry is added.
	TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
	  VectorizableTree.emplace_back();
	  const int Idx = VectorizableTree.size() - 1;
	  TreeEntry *Last = &VectorizableTree[Idx];
	  Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
	  Last->NeedToGather = !Vectorized;

	  if (!Vectorized) {
	    MustGather.insert(VL.begin(), VL.end());
	    return Last;
	  }

	  for (Value *Scalar : VL) {
	    assert(!ScalarToTreeEntry.count(Scalar) && "Scalar already in tree!");
	    ScalarToTreeEntry[Scalar] = Idx;
	  }
	  return Last;
	}

	/// -- Vectorization State --
	/// Holds all of the tree entries.
	std::vector<TreeEntry> VectorizableTree;

	/// Maps a specific scalar to its tree entry.
	SmallDenseMap<Value*, int> ScalarToTreeEntry;

	/// A list of scalars that we found that we need to keep as scalars.
	ValueSet MustGather;

	/// This POD struct describes one external user in the vectorized tree.
	struct ExternalUser {
	ExternalUser (Value S, llvm::User U, int L) :
	Scalar(S), User(U), Lane(L){}
	// Which scalar in our function.
	Value *Scalar;
	// Which user that uses the scalar.
	llvm::User *User;
	// Which lane does the scalar belong to.
	int Lane;
	};
	typedef SmallVector<ExternalUser, 16> UserList;

	/// Checks if two instructions may access the same memory.
	///
	/// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
	/// is invariant in the calling loop.
	bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
	Instruction *Inst2) {

	// First check if the result is already in the cache.
	AliasCacheKey key = std::make_pair(Inst1, Inst2);
	Optional<bool> &result = AliasCache[key];
	if (result.hasValue()) {
	return result.getValue();
	}
	MemoryLocation Loc2 = getLocation(Inst2, AA);
	bool aliased = true;
	if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
	// Do the alias check.
	aliased = AA->alias(Loc1, Loc2);
	}
	// Store the result in the cache.
	result = aliased;
	return aliased;
	}

	/// Key of the alias cache: the ordered pair of instructions queried.
	typedef std::pair<Instruction *, Instruction *> AliasCacheKey;

	/// Cache for alias results.
	/// TODO: consider moving this to the AliasAnalysis itself.
	DenseMap<AliasCacheKey, Optional<bool>> AliasCache;

	/// Unlinks \p I from its block, drops its references and parks it in
	/// DeletedInstructions instead of deleting it right away.
	///
	/// This works like Instruction::eraseFromParent() with the deletion
	/// postponed until BoUpSLP is destructed. The delay guarantees that no new
	/// instruction can be allocated at the address of a deleted one, which
	/// would otherwise produce incorrect hits in the AliasCache.
	void eraseInstruction(Instruction *I) {
	  I->removeFromParent();
	  I->dropAllReferences();
	  std::unique_ptr<Instruction> Owner(I);
	  DeletedInstructions.push_back(std::move(Owner));
	}

	/// Temporary store for deleted instructions. Instructions will be deleted
	/// eventually when the BoUpSLP is destructed.
	SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;

	/// A list of values that need to extracted out of the tree.
	/// This list holds pairs of (Internal Scalar : External User).
	UserList ExternalUses;

	/// Values used only by @llvm.assume calls.
	SmallPtrSet<const Value *, 32> EphValues;

	/// Holds all of the instructions that we gathered.
	SetVector<Instruction *> GatherSeq;
	/// A list of blocks that we are going to CSE.
	SetVector<BasicBlock *> CSEBlocks;

	/// Contains all scheduling relevant data for an instruction.
	/// A ScheduleData either represents a single instruction or a member of an
	/// instruction bundle (= a group of instructions which is combined into a
	/// vector instruction).
	struct ScheduleData {

	// The initial value for the dependency counters. It means that the
	// dependencies are not calculated yet.
	enum { InvalidDeps = -1 };

	ScheduleData()
	: Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
	NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
	Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
	UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}

	/// Resets this entry to a stand-alone, unscheduled instruction that belongs
	/// to the scheduling region \p BlockSchedulingRegionID and has no
	/// dependency information.
	void init(int BlockSchedulingRegionID) {
	FirstInBundle = this;
	NextInBundle = nullptr;
	NextLoadStore = nullptr;
	IsScheduled = false;
	SchedulingRegionID = BlockSchedulingRegionID;
	// Seed the bundle counter from the per-instruction counter first:
	// clearDependencies() below adjusts both counters by the same delta.
	UnscheduledDepsInBundle = UnscheduledDeps;
	clearDependencies();
	}

	/// Returns true if the dependency information has been calculated.
	bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

	/// Returns true for single instructions and for bundle representatives
	/// (= the head of a bundle).
	bool isSchedulingEntity() const { return FirstInBundle == this; }

	/// Returns true if it represents an instruction bundle and not only a
	/// single instruction, i.e. if it either has a successor in the bundle or
	/// is not its own bundle head.
	bool isPartOfBundle() const {
	  if (NextInBundle != nullptr)
	    return true;
	  return FirstInBundle != this;
	}

	/// Returns true if it is ready for scheduling, i.e. it has no more
	/// unscheduled depending instructions/bundles.
	bool isReady() const {
	assert(isSchedulingEntity() &&
	"can't consider non-scheduling entity for ready list");
	return UnscheduledDepsInBundle == 0 && !IsScheduled;
	}

	/// Adds \p Incr to the number of unscheduled dependencies of this
	/// instruction and to the counter of the bundle it belongs to.
	/// \returns the updated counter of the bundle head.
	int incrementUnscheduledDeps(int Incr) {
	  UnscheduledDeps += Incr;
	  int &BundleCounter = FirstInBundle->UnscheduledDepsInBundle;
	  BundleCounter += Incr;
	  return BundleCounter;
	}

	/// Sets the number of unscheduled dependencies to the number of
	/// dependencies.
	void resetUnscheduledDeps() {
	incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
	}

	/// Clears all dependency information: marks Dependencies as not yet
	/// calculated, brings the unscheduled counters in line with that value and
	/// empties MemoryDependencies.
	void clearDependencies() {
	Dependencies = InvalidDeps;
	// Must run after Dependencies was changed: it applies the difference
	// between Dependencies and UnscheduledDeps to the counters.
	resetUnscheduledDeps();
	MemoryDependencies.clear();
	}

	/// Prints this entry to \p os: "/ <inst>" for a non-head bundle member,
	/// "[<inst>;<inst>;...]" for the head of a bundle, and the bare instruction
	/// for a single instruction.
	void dump(raw_ostream &os) const {
	  if (!isSchedulingEntity()) {
	    os << "/ " << *Inst;
	    return;
	  }
	  if (!NextInBundle) {
	    os << *Inst;
	    return;
	  }

	  os << '[' << *Inst;
	  for (ScheduleData *Member = NextInBundle; Member;
	       Member = Member->NextInBundle)
	    os << ';' << *Member->Inst;
	  os << ']';
	}

	Instruction *Inst;

	/// Points to the head in an instruction bundle (and always to this for
	/// single instructions).
	ScheduleData *FirstInBundle;

	/// Single linked list of all instructions in a bundle. Null if it is a
	/// single instruction.
	ScheduleData *NextInBundle;

	/// Single linked list of all memory instructions (e.g. load, store, call)
	/// in the block - until the end of the scheduling region.
	ScheduleData *NextLoadStore;

	/// The dependent memory instructions.
	/// This list is derived on demand in calculateDependencies().
	SmallVector<ScheduleData *, 4> MemoryDependencies;

	/// This ScheduleData is in the current scheduling region if this matches
	/// the current SchedulingRegionID of BlockScheduling.
	int SchedulingRegionID;

	/// Used for getting a "good" final ordering of instructions.
	int SchedulingPriority;

	/// The number of dependencies. Constitutes of the number of users of the
	/// instruction plus the number of dependent memory instructions (if any).
	/// This value is calculated on demand.
	/// If InvalidDeps, the number of dependencies is not calculated yet.
	///
	int Dependencies;

	/// The number of dependencies minus the number of dependencies of scheduled
	/// instructions. As soon as this is zero, the instruction/bundle gets ready
	/// for scheduling.
	/// Note that this is negative as long as Dependencies is not calculated.
	int UnscheduledDeps;

	/// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
	/// single instructions.
	int UnscheduledDepsInBundle;

	/// True if this instruction is scheduled (or considered as scheduled in the
	/// dry-run).
	bool IsScheduled;
	};

	#ifndef NDEBUG
	/// Streams \p SD to \p os by calling ScheduleData::dump(). Only declared
	/// when NDEBUG is not defined.
	friend inline raw_ostream &operator<<(raw_ostream &os,
	const BoUpSLP::ScheduleData &SD) {
	SD.dump(os);
	return os;
	}
	#endif

	/// Contains all scheduling data for a basic block.
	///
	struct BlockScheduling {

	/// Creates the (empty) scheduling state for \p BB. The chunk size of the
	/// ScheduleData allocator is the number of instructions in the block, and
	/// the region size limit starts at the ScheduleRegionSizeBudget option.
	// NOTE(review): ChunkPos starts equal to ChunkSize, presumably so that the
	// first allocation opens a fresh chunk -- confirm in the allocation code.
	BlockScheduling(BasicBlock *BB)
	: BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
	ScheduleStart(nullptr), ScheduleEnd(nullptr),
	FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
	ScheduleRegionSize(0),
	ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
	// Make sure that the initial SchedulingRegionID is greater than the
	// initial SchedulingRegionID in ScheduleData (which is 0).
	SchedulingRegionID(1) {}

	/// Ends the current scheduling region: empties the ready list, forgets the
	/// region bounds and the load/store chain, charges the size of the finished
	/// region against the remaining budget and starts a new region ID.
	void clear() {
	  ReadyInsts.clear();
	  ScheduleStart = ScheduleEnd = nullptr;
	  FirstLoadStoreInRegion = LastLoadStoreInRegion = nullptr;

	  // Reduce the maximum schedule region size by the size of the previous
	  // scheduling run, but never let it drop below MinScheduleRegionSize.
	  ScheduleRegionSizeLimit = std::max(
	      ScheduleRegionSizeLimit - ScheduleRegionSize, MinScheduleRegionSize);
	  ScheduleRegionSize = 0;

	  // Bumping the ID opens a new scheduling region: no existing ScheduleData
	  // carries the new ID, so none of it is in the region yet.
	  ++SchedulingRegionID;
	}

	/// \returns the ScheduleData attached to \p V if there is one and it
	/// belongs to the current scheduling region, and nullptr otherwise.
	ScheduleData *getScheduleData(Value *V) {
	  // lookup() yields a null pointer for unknown values without inserting an
	  // entry, so a failed query does not grow the map.
	  ScheduleData *SD = ScheduleDataMap.lookup(V);
	  if (SD && SD->SchedulingRegionID == SchedulingRegionID)
	    return SD;
	  return nullptr;
	}

	bool isInSchedulingRegion(ScheduleData *SD) {
	return SD->SchedulingRegionID == SchedulingRegionID;
	}

	/// Marks \p SD as scheduled and inserts into \p ReadyList the head of every
	/// bundle whose unscheduled-dependency counter reaches zero because of it.
	///
	/// \p SD and each following member of its bundle are visited; for every
	/// member both the definitions of its operands and its recorded memory
	/// dependencies have one dependency taken off.
	template <typename ReadyListType>
	void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
	  SD->IsScheduled = true;
	  DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");

	  for (ScheduleData *BundleMember = SD; BundleMember;
	       BundleMember = BundleMember->NextInBundle) {
	    // Handle the def-use chain dependencies.
	    for (Use &U : BundleMember->Inst->operands()) {
	      ScheduleData *OpDef = getScheduleData(U.get());
	      // Operands outside the region or without calculated dependencies are
	      // left alone (their counter is not touched).
	      if (!OpDef)
	        continue;
	      if (!OpDef->hasValidDependencies())
	        continue;
	      if (OpDef->incrementUnscheduledDeps(-1) != 0)
	        continue;

	      // There are no more unscheduled dependencies after decrementing,
	      // so we can put the dependent instruction into the ready list.
	      ScheduleData *DepBundle = OpDef->FirstInBundle;
	      assert(!DepBundle->IsScheduled &&
	             "already scheduled bundle gets ready");
	      ReadyList.insert(DepBundle);
	      DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
	    }

	    // Handle the memory dependencies.
	    for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
	      if (MemoryDepSD->incrementUnscheduledDeps(-1) != 0)
	        continue;

	      // There are no more unscheduled dependencies after decrementing,
	      // so we can put the dependent instruction into the ready list.
	      ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
	      assert(!DepBundle->IsScheduled &&
	             "already scheduled bundle gets ready");
	      ReadyList.insert(DepBundle);
	      DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
	    }
	  }
	}

	/// Put all instructions into the ReadyList which are ready for scheduling.
	///
	/// Walks the instructions from ScheduleStart up to (not including)
	/// ScheduleEnd and inserts every scheduling entity whose isReady() is true.
	template <typename ReadyListType>
	void initialFillReadyList(ReadyListType &ReadyList) {
	for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
	// NOTE(review): SD is dereferenced without a null check; this assumes
	// every instruction inside the region has ScheduleData for the current
	// region ID -- confirm against initScheduleData.
	ScheduleData *SD = getScheduleData(I);
	if (SD->isSchedulingEntity() && SD->isReady()) {
	ReadyList.insert(SD);
	DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
	}
	}
	}

	/// Checks if a bundle of instructions can be scheduled, i.e. has no
	/// cyclic dependencies. This is only a dry-run, no instructions are
	/// actually moved at this stage.
	bool tryScheduleBundle(ArrayRef<Value > VL, BoUpSLP SLP);

	/// Un-bundles a group of instructions.
	void cancelScheduling(ArrayRef<Value *> VL);

	/// Extends the scheduling region so that V is inside the region.
	/// \returns true if the region size is within the limit.
	bool extendSchedulingRegion(Value *V);

	/// Initialize the ScheduleData structures for new instructions in the
	/// scheduling region.
	void initScheduleData(Instruction FromI, Instruction ToI,
	ScheduleData *PrevLoadStore,
	ScheduleData *NextLoadStore);

	/// Updates the dependency information of a bundle and of all instructions/
	/// bundles which depend on the original bundle.
	void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
	BoUpSLP *SLP);

	/// Sets all instruction in the scheduling region to un-scheduled.
	void resetSchedule();

	BasicBlock *BB;

	/// Simple memory allocation for ScheduleData.
	std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

	/// The size of a ScheduleData array in ScheduleDataChunks.
	int ChunkSize;

	/// The allocator position in the current chunk, which is the last entry
	/// of ScheduleDataChunks.
	int ChunkPos;

	/// Attaches ScheduleData to Instruction.
	/// Note that the mapping survives during all vectorization iterations, i.e.
	/// ScheduleData structures are recycled.
	DenseMap<Value *, ScheduleData *> ScheduleDataMap;

	struct ReadyList : SmallVector<ScheduleData *, 8> {
	void insert(ScheduleData *SD) { push_back(SD); }
	};

	/// The ready-list for scheduling (only used for the dry-run).
	ReadyList ReadyInsts;

	/// The first instruction of the scheduling region.
	Instruction *ScheduleStart;

	/// The first instruction _after_ the scheduling region.
	Instruction *ScheduleEnd;

	/// The first memory accessing instruction in the scheduling region
	/// (can be null).
	ScheduleData *FirstLoadStoreInRegion;

	/// The last memory accessing instruction in the scheduling region
	/// (can be null).
	ScheduleData *LastLoadStoreInRegion;

	/// The current size of the scheduling region.
	int ScheduleRegionSize;

	/// The maximum size allowed for the scheduling region.
	int ScheduleRegionSizeLimit;

	/// The ID of the scheduling region. For a new vectorization iteration this
	/// is incremented which "removes" all ScheduleData from the region.
	int SchedulingRegionID;
	};

	/// Attaches the BlockScheduling structures to basic blocks.
	MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

	/// Performs the "real" scheduling. Done before vectorization is actually
	/// performed in a basic block.
	void scheduleBlock(BlockScheduling *BS);

	/// List of users to ignore during scheduling and that don't need extracting.
	ArrayRef<Value *> UserIgnoreList;

	// Number of load bundles that contain consecutive loads.
	int NumLoadsWantToKeepOrder;

	// Number of load bundles that contain consecutive loads in reversed order.
	int NumLoadsWantToChangeOrder;

	// Analysis and block reference.
	Function *F;
	ScalarEvolution *SE;
	TargetTransformInfo *TTI;
	TargetLibraryInfo *TLI;
	AliasAnalysis *AA;
	LoopInfo *LI;
	DominatorTree *DT;
	AssumptionCache *AC;
	DemandedBits *DB;
	const DataLayout *DL;
	unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
	unsigned MinVecRegSize; // Set by cl::opt (default: 128).
	/// Instruction builder to construct the vectorized tree.
	IRBuilder<> Builder;

	/// A map of scalar integer values to the smallest bit width with which they
	/// can legally be represented. The values map to (width, signed) pairs,
	/// where "width" indicates the minimum bit width and "signed" is True if the
	/// value must be signed-extended, rather than zero-extended, back to its
	/// original width.
	MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
	};

	} // end namespace slpvectorizer
	} // end namespace llvm

	/// Builds the vectorizable tree starting from \p Roots and collects the
	/// external uses of the scalars that ended up in it.
	///
	/// Any previously built tree is dropped with deleteTree() and
	/// \p UserIgnoreLst is stored in UserIgnoreList. Nothing is built when the
	/// roots do not all have the same type. After buildTree_rec() has run, every
	/// instruction user of a scalar in a non-gather entry is appended to
	/// ExternalUses (with the scalar's lane) unless it is an in-tree user that
	/// will be removed or it is contained in UserIgnoreList.
	void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
	                        ArrayRef<Value *> UserIgnoreLst) {
	  deleteTree();
	  UserIgnoreList = UserIgnoreLst;
	  if (!allSameType(Roots))
	    return;
	  buildTree_rec(Roots, 0);

	  // Collect the values that we need to extract from the tree.
	  for (TreeEntry &Entry : VectorizableTree) {
	    // No need to handle users of gathered values. The flag is per entry, so
	    // it is tested once instead of once per lane.
	    if (Entry.NeedToGather)
	      continue;

	    // For each lane:
	    for (int Lane = 0, LE = Entry.Scalars.size(); Lane != LE; ++Lane) {
	      Value *Scalar = Entry.Scalars[Lane];

	      for (User *U : Scalar->users()) {
	        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

	        Instruction *UserInst = dyn_cast<Instruction>(U);
	        if (!UserInst)
	          continue;

	        // Skip in-tree scalars that become vectors
	        if (ScalarToTreeEntry.count(U)) {
	          int Idx = ScalarToTreeEntry[U];
	          TreeEntry *UseEntry = &VectorizableTree[Idx];
	          Value *UseScalar = UseEntry->Scalars[0];
	          // Some in-tree scalars will remain as scalar in vectorized
	          // instructions. If that is the case, the one in Lane 0 will
	          // be used.
	          if (UseScalar != U ||
	              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
	            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
	                         << ".\n");
	            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
	            continue;
	          }
	        }

	        // Ignore users in the user ignore list.
	        if (is_contained(UserIgnoreList, UserInst))
	          continue;

	        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
	              Lane << " from " << *Scalar << ".\n");
	        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
	      }
	    }
	  }
	}


	/// Recursively builds the vectorizable tree for the bundle \p VL at
	/// recursion depth \p Depth.
	///
	/// When one of the legality checks below fails, the bundle is recorded with
	/// newTreeEntry(VL, false) (reported as "gathering" in the debug output); if
	/// the bundle had already been scheduled successfully at that point, the
	/// scheduling is cancelled first. Otherwise the bundle is recorded with
	/// newTreeEntry(VL, true) and, for opcodes whose operands are followed, the
	/// function recurses into each operand bundle with Depth + 1.
	void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
	  bool isAltShuffle = false;
	  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

	  if (Depth == RecursionMaxDepth) {
	    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
	    newTreeEntry(VL, false);
	    return;
	  }

	  // Don't handle vectors.
	  if (VL[0]->getType()->isVectorTy()) {
	    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
	    newTreeEntry(VL, false);
	    return;
	  }

	  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
	    if (SI->getValueOperand()->getType()->isVectorTy()) {
	      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
	      newTreeEntry(VL, false);
	      return;
	    }
	  unsigned Opcode = getSameOpcode(VL);

	  // Check that this shuffle vector refers to the alternate
	  // sequence of opcodes.
	  if (Opcode == Instruction::ShuffleVector) {
	    // The result is dereferenced unconditionally, so use the asserting cast.
	    Instruction *I0 = cast<Instruction>(VL[0]);
	    unsigned Op = I0->getOpcode();
	    if (Op != Instruction::ShuffleVector)
	      isAltShuffle = true;
	  }

	  // If all of the operands are identical or constant we have a simple solution.
	  if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
	    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
	    newTreeEntry(VL, false);
	    return;
	  }

	  // We now know that this is a vector of instructions of the same type from
	  // the same block.

	  // Don't vectorize ephemeral values.
	  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
	    if (EphValues.count(VL[i])) {
	      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
	            ") is ephemeral.\n");
	      newTreeEntry(VL, false);
	      return;
	    }
	  }

	  // Check if this is a duplicate of another entry.
	  if (ScalarToTreeEntry.count(VL[0])) {
	    int Idx = ScalarToTreeEntry[VL[0]];
	    TreeEntry *E = &VectorizableTree[Idx];
	    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
	      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
	      if (E->Scalars[i] != VL[i]) {
	        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
	        newTreeEntry(VL, false);
	        return;
	      }
	    }
	    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
	    return;
	  }

	  // Check that none of the instructions in the bundle are already in the tree.
	  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
	    if (ScalarToTreeEntry.count(VL[i])) {
	      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
	            ") is already in tree.\n");
	      newTreeEntry(VL, false);
	      return;
	    }
	  }

	  // If any of the scalars is marked as a value that needs to stay scalar then
	  // we need to gather the scalars.
	  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
	    if (MustGather.count(VL[i])) {
	      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
	      newTreeEntry(VL, false);
	      return;
	    }
	  }

	  // Check that all of the users of the scalars that we want to vectorize are
	  // schedulable.
	  Instruction *VL0 = cast<Instruction>(VL[0]);
	  BasicBlock *BB = cast<Instruction>(VL0)->getParent();

	  if (!DT->isReachableFromEntry(BB)) {
	    // Don't go into unreachable blocks. They may contain instructions with
	    // dependency cycles which confuse the final scheduling.
	    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
	    newTreeEntry(VL, false);
	    return;
	  }

	  // Check that every instructions appears once in this bundle.
	  for (unsigned i = 0, e = VL.size(); i < e; ++i)
	    for (unsigned j = i+1; j < e; ++j)
	      if (VL[i] == VL[j]) {
	        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
	        newTreeEntry(VL, false);
	        return;
	      }

	  // Create the per-block scheduling state on first use.
	  auto &BSRef = BlocksSchedules[BB];
	  if (!BSRef) {
	    BSRef = llvm::make_unique<BlockScheduling>(BB);
	  }
	  BlockScheduling &BS = *BSRef.get();

	  if (!BS.tryScheduleBundle(VL, this)) {
	    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
	    assert((!BS.getScheduleData(VL[0]) ||
	            !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
	           "tryScheduleBundle should cancelScheduling on failure");
	    newTreeEntry(VL, false);
	    return;
	  }
	  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

	  // From here on the bundle is scheduled: every path that gives up must call
	  // BS.cancelScheduling(VL) before recording the gather entry.
	  switch (Opcode) {
	  case Instruction::PHI: {
	    // Opcode is PHI, so VL0 is known to be a PHINode.
	    PHINode *PH = cast<PHINode>(VL0);

	    // Check for terminator values (e.g. invoke).
	    for (unsigned j = 0; j < VL.size(); ++j)
	      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
	        TerminatorInst *Term = dyn_cast<TerminatorInst>(
	            cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
	        if (Term) {
	          DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
	          BS.cancelScheduling(VL);
	          newTreeEntry(VL, false);
	          return;
	        }
	      }

	    newTreeEntry(VL, true);
	    DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

	    for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
	      ValueList Operands;
	      // Prepare the operand vector.
	      for (Value *j : VL)
	        Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
	            PH->getIncomingBlock(i)));

	      buildTree_rec(Operands, Depth + 1);
	    }
	    return;
	  }
	  case Instruction::ExtractValue:
	  case Instruction::ExtractElement: {
	    bool Reuse = canReuseExtract(VL, Opcode);
	    if (Reuse) {
	      DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
	    } else {
	      BS.cancelScheduling(VL);
	    }
	    newTreeEntry(VL, Reuse);
	    return;
	  }
	  case Instruction::Load: {
	    // Check that a vectorized load would load the same memory as a scalar
	    // load.
	    // For example we don't want vectorize loads that are smaller than 8 bit.
	    // Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
	    // loading/storing it as an i8 struct. If we vectorize loads/stores from
	    // such a struct we read/write packed bits disagreeing with the
	    // unvectorized version.
	    Type *ScalarTy = VL[0]->getType();

	    if (DL->getTypeSizeInBits(ScalarTy) !=
	        DL->getTypeAllocSizeInBits(ScalarTy)) {
	      BS.cancelScheduling(VL);
	      newTreeEntry(VL, false);
	      DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
	      return;
	    }

	    // Make sure all loads in the bundle are simple - we can't vectorize
	    // atomic or volatile loads. Every element is inspected, including the
	    // last one, which a "size() - 1" bound would skip.
	    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
	      LoadInst *L = cast<LoadInst>(VL[i]);
	      if (!L->isSimple()) {
	        BS.cancelScheduling(VL);
	        newTreeEntry(VL, false);
	        DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
	        return;
	      }
	    }

	    // Check if the loads are consecutive, reversed, or neither.
	    // TODO: What we really want is to sort the loads, but for now, check
	    // the two likely directions.
	    bool Consecutive = true;
	    bool ReverseConsecutive = true;
	    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
	      if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
	        Consecutive = false;
	        break;
	      } else {
	        // At least one in-order pair is consecutive, so the bundle cannot be
	        // consecutive in reversed order.
	        ReverseConsecutive = false;
	      }
	    }

	    if (Consecutive) {
	      ++NumLoadsWantToKeepOrder;
	      newTreeEntry(VL, true);
	      DEBUG(dbgs() << "SLP: added a vector of loads.\n");
	      return;
	    }

	    // If none of the load pairs were consecutive when checked in order,
	    // check the reverse order.
	    if (ReverseConsecutive)
	      for (unsigned i = VL.size() - 1; i > 0; --i)
	        if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
	          ReverseConsecutive = false;
	          break;
	        }

	    BS.cancelScheduling(VL);
	    newTreeEntry(VL, false);

	    if (ReverseConsecutive) {
	      ++NumLoadsWantToChangeOrder;
	      DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
	    } else {
	      DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
	    }
	    return;
	  }
	  case Instruction::ZExt:
	  case Instruction::SExt:
	  case Instruction::FPToUI:
	  case Instruction::FPToSI:
	  case Instruction::FPExt:
	  case Instruction::PtrToInt:
	  case Instruction::IntToPtr:
	  case Instruction::SIToFP:
	  case Instruction::UIToFP:
	  case Instruction::Trunc:
	  case Instruction::FPTrunc:
	  case Instruction::BitCast: {
	    Type *SrcTy = VL0->getOperand(0)->getType();
	    for (unsigned i = 0; i < VL.size(); ++i) {
	      Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
	      if (Ty != SrcTy || !isValidElementType(Ty)) {
	        BS.cancelScheduling(VL);
	        newTreeEntry(VL, false);
	        DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
	        return;
	      }
	    }
	    newTreeEntry(VL, true);
	    DEBUG(dbgs() << "SLP: added a vector of casts.\n");

	    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
	      ValueList Operands;
	      // Prepare the operand vector.
	      for (Value *j : VL)
	        Operands.push_back(cast<Instruction>(j)->getOperand(i));

	      buildTree_rec(Operands, Depth+1);
	    }
	    return;
	  }
	  case Instruction::ICmp:
	  case Instruction::FCmp: {
	    // Check that all of the compares have the same predicate.
	    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
	    Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
	    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
	      CmpInst *Cmp = cast<CmpInst>(VL[i]);
	      if (Cmp->getPredicate() != P0 ||
	          Cmp->getOperand(0)->getType() != ComparedTy) {
	        BS.cancelScheduling(VL);
	        newTreeEntry(VL, false);
	        DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
	        return;
	      }
	    }

	    newTreeEntry(VL, true);
	    DEBUG(dbgs() << "SLP: added a vector of compares.\n");

	    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
	      ValueList Operands;
	      // Prepare the operand vector.
	      for (Value *j : VL)
	        Operands.push_back(cast<Instruction>(j)->getOperand(i));

	      buildTree_rec(Operands, Depth+1);
	    }
	    return;
	  }
	  case Instruction::Select:
	  case Instruction::Add:
	  case Instruction::FAdd:
	  case Instruction::Sub:
	  case Instruction::FSub:
	  case Instruction::Mul:
	  case Instruction::FMul:
	  case Instruction::UDiv:
	  case Instruction::SDiv:
	  case Instruction::FDiv:
	  case Instruction::URem:
	  case Instruction::SRem:
	  case Instruction::FRem:
	  case Instruction::Shl:
	  case Instruction::LShr:
	  case Instruction::AShr:
	  case Instruction::And:
	  case Instruction::Or:
	  case Instruction::Xor: {
	    newTreeEntry(VL, true);
	    DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

	    // Sort operands of the instructions so that each side is more likely to
	    // have the same opcode.
	    if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
	      ValueList Left, Right;
	      reorderInputsAccordingToOpcode(VL, Left, Right);
	      buildTree_rec(Left, Depth + 1);
	      buildTree_rec(Right, Depth + 1);
	      return;
	    }

	    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
	      ValueList Operands;
	      // Prepare the operand vector.
	      for (Value *j : VL)
	        Operands.push_back(cast<Instruction>(j)->getOperand(i));

	      buildTree_rec(Operands, Depth+1);
	    }
	    return;
	  }
	  case Instruction::GetElementPtr: {
	    // We don't combine GEPs with complicated (nested) indexing.
	    for (unsigned j = 0; j < VL.size(); ++j) {
	      if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
	        DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
	        BS.cancelScheduling(VL);
	        newTreeEntry(VL, false);
	        return;
	      }
	    }

	    // We can't combine several GEPs into one vector if they operate on
	    // different types.
	    Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
	    for (unsigned j = 0; j < VL.size(); ++j) {
	      Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
	      if (Ty0 != CurTy) {
	        DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
	        BS.cancelScheduling(VL);
	        newTreeEntry(VL, false);
	        return;
	      }
	    }

	    // We don't combine GEPs with non-constant indexes.
	    for (unsigned j = 0; j < VL.size(); ++j) {
	      auto Op = cast<Instruction>(VL[j])->getOperand(1);
	      if (!isa<ConstantInt>(Op)) {
	        DEBUG(
	            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
	        BS.cancelScheduling(VL);
	        newTreeEntry(VL, false);
	        return;
	      }
	    }

	    newTreeEntry(VL, true);
	    DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
	    // Exactly two operands per GEP were verified above.
	    for (unsigned i = 0, e = 2; i < e; ++i) {
	      ValueList Operands;
	      // Prepare the operand vector.
	      for (Value *j : VL)
	        Operands.push_back(cast<Instruction>(j)->getOperand(i));

	      buildTree_rec(Operands, Depth + 1);
	    }
	    return;
	  }
	  case Instruction::Store: {
	    // Check if the stores are consecutive or of we need to swizzle them.
	    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
	      if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
	        BS.cancelScheduling(VL);
	        newTreeEntry(VL, false);
	        DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
	        return;
	      }

	    newTreeEntry(VL, true);
	    DEBUG(dbgs() << "SLP: added a vector of stores.\n");

	    // Only operand 0 of each store is followed.
	    ValueList Operands;
	    for (Value *j : VL)
	      Operands.push_back(cast<Instruction>(j)->getOperand(0));

	    buildTree_rec(Operands, Depth + 1);
	    return;
	  }
	  case Instruction::Call: {
	    // Check if the calls are all to the same vectorizable intrinsic.
	    CallInst *CI = cast<CallInst>(VL[0]);
	    // Check if this is an Intrinsic call or something that can be
	    // represented by an intrinsic call
	    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
	    if (!isTriviallyVectorizable(ID)) {
	      BS.cancelScheduling(VL);
	      newTreeEntry(VL, false);
	      DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
	      return;
	    }
	    Function *Int = CI->getCalledFunction();
	    Value *A1I = nullptr;
	    if (hasVectorInstrinsicScalarOpd(ID, 1))
	      A1I = CI->getArgOperand(1);
	    for (unsigned i = 1, e = VL.size(); i != e; ++i) {
	      CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
	      if (!CI2 || CI2->getCalledFunction() != Int ||
	          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
	          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
	        BS.cancelScheduling(VL);
	        newTreeEntry(VL, false);
	        DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
	                     << "\n");
	        return;
	      }
	      // ctlz,cttz and powi are special intrinsics whose second argument
	      // should be same in order for them to be vectorized.
	      if (hasVectorInstrinsicScalarOpd(ID, 1)) {
	        Value *A1J = CI2->getArgOperand(1);
	        if (A1I != A1J) {
	          BS.cancelScheduling(VL);
	          newTreeEntry(VL, false);
	          DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
	                       << " argument "<< A1I<<"!=" << A1J
	                       << "\n");
	          return;
	        }
	      }
	      // Verify that the bundle operands are identical between the two calls.
	      if (CI->hasOperandBundles() &&
	          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
	                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
	                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
	        BS.cancelScheduling(VL);
	        newTreeEntry(VL, false);
	        DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
	                     << *VL[i] << '\n');
	        return;
	      }
	    }

	    newTreeEntry(VL, true);
	    for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
	      ValueList Operands;
	      // Prepare the operand vector.
	      for (Value *j : VL) {
	        // Every element was verified to be a CallInst in the loop above.
	        CallInst *CI2 = cast<CallInst>(j);
	        Operands.push_back(CI2->getArgOperand(i));
	      }
	      buildTree_rec(Operands, Depth + 1);
	    }
	    return;
	  }
	  case Instruction::ShuffleVector: {
	    // If this is not an alternate sequence of opcode like add-sub
	    // then do not vectorize this instruction.
	    if (!isAltShuffle) {
	      BS.cancelScheduling(VL);
	      newTreeEntry(VL, false);
	      DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
	      return;
	    }
	    newTreeEntry(VL, true);
	    DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");

	    // Reorder operands if reordering would enable vectorization.
	    if (isa<BinaryOperator>(VL0)) {
	      ValueList Left, Right;
	      reorderAltShuffleOperands(VL, Left, Right);
	      buildTree_rec(Left, Depth + 1);
	      buildTree_rec(Right, Depth + 1);
	      return;
	    }

	    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
	      ValueList Operands;
	      // Prepare the operand vector.
	      for (Value *j : VL)
	        Operands.push_back(cast<Instruction>(j)->getOperand(i));

	      buildTree_rec(Operands, Depth + 1);
	    }
	    return;
	  }
	  default:
	    BS.cancelScheduling(VL);
	    newTreeEntry(VL, false);
	    DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
	    return;
	  }
	}

	/// Checks whether the struct or array type \p T can be treated as a vector.
	///
	/// \returns the number of elements of \p T when its first element type is a
	/// valid vector element type, the store size of the corresponding vector
	/// type lies within [MinVecRegSize, MaxVecRegSize] and equals the store size
	/// of \p T, and (for structs) all element types are identical; 0 otherwise.
	/// \p T must be a StructType or an ArrayType.
	unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
	  unsigned N;
	  Type *EltTy;
	  auto *ST = dyn_cast<StructType>(T);
	  if (ST) {
	    N = ST->getNumElements();
	    EltTy = *ST->element_begin();
	  } else {
	    auto *AT = cast<ArrayType>(T);
	    N = AT->getNumElements();
	    EltTy = AT->getElementType();
	  }
	  if (!isValidElementType(EltTy))
	    return 0;

	  // The candidate vector must fit the register size limits and occupy
	  // exactly as much storage as the aggregate it replaces.
	  uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
	  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
	      VTSize != DL.getTypeStoreSizeInBits(T))
	    return 0;

	  if (ST) {
	    // Check that struct is homogeneous.
	    for (const auto *Ty : ST->elements())
	      if (Ty != EltTy)
	        return 0;
	  }
	  return N;
	}

	/// Checks whether the extracts in \p VL read all elements, in order, of one
	/// and the same source vector/aggregate, so that the source can be reused.
	///
	/// \p Opcode must be ExtractElement or ExtractValue and equal to the common
	/// opcode of \p VL. For ExtractValue the source must additionally be
	/// mappable to a vector (canMapToVector) and be a simple load with exactly
	/// VL.size() uses.
	bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
	  assert(Opcode == Instruction::ExtractElement ||
	         Opcode == Instruction::ExtractValue);
	  assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
	  // Check if all of the extracts come from the same vector and from the
	  // correct offset.
	  Value *VL0 = VL[0];
	  Instruction *E0 = cast<Instruction>(VL0);
	  Value *Vec = E0->getOperand(0);

	  // We have to extract from a vector/aggregate with the same number of elements.
	  unsigned NElts;
	  if (Opcode == Instruction::ExtractValue) {
	    const DataLayout &DL = E0->getModule()->getDataLayout();
	    NElts = canMapToVector(Vec->getType(), DL);
	    if (!NElts)
	      return false;
	    // Check if load can be rewritten as load of vector.
	    LoadInst *Load = dyn_cast<LoadInst>(Vec);
	    if (!Load || !Load->isSimple() || !Load->hasNUses(VL.size()))
	      return false;
	  } else {
	    NElts = Vec->getType()->getVectorNumElements();
	  }

	  if (NElts != VL.size())
	    return false;

	  // Check that all of the indices extract from the correct offset.
	  if (!matchExtractIndex(E0, 0, Opcode))
	    return false;

	  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
	    Instruction *E = cast<Instruction>(VL[i]);
	    if (!matchExtractIndex(E, i, Opcode))
	      return false;
	    if (E->getOperand(0) != Vec)
	      return false;
	  }

	  return true;
	}

	/// Computes the cost of the tree entry \p E.
	///
	/// For gather entries this is 0 when all scalars are constants, the cost of
	/// a broadcast shuffle when the bundle is a splat, and the gather cost
	/// otherwise. For all other entries the result is the TTI cost of the
	/// vector form minus the TTI cost of the scalar instructions it replaces,
	/// so a negative value means the vector form is cheaper.
	int BoUpSLP::getEntryCost(TreeEntry *E) {
	  ArrayRef<Value *> VL = E->Scalars;

	  // For stores the element type is the type of the stored value.
	  Type *ScalarTy = VL[0]->getType();
	  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
	    ScalarTy = SI->getValueOperand()->getType();
	  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

	  // If we have computed a smaller type for the expression, update VecTy so
	  // that the costs will be accurate.
	  if (MinBWs.count(VL[0]))
	    VecTy = VectorType::get(
	        IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());

	  if (E->NeedToGather) {
	    if (allConstant(VL))
	      return 0;
	    if (isSplat(VL)) {
	      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
	    }
	    return getGatherCost(E->Scalars);
	  }
	  unsigned Opcode = getSameOpcode(VL);
	  assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
	  Instruction *VL0 = cast<Instruction>(VL[0]);
	  switch (Opcode) {
	  case Instruction::PHI: {
	    return 0;
	  }
	  case Instruction::ExtractValue:
	  case Instruction::ExtractElement: {
	    if (canReuseExtract(VL, Opcode)) {
	      int DeadCost = 0;
	      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
	        Instruction *Ext = cast<Instruction>(VL[i]);
	        if (Ext->hasOneUse())
	          // Take credit for instruction that will become dead.
	          DeadCost +=
	              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
	      }
	      return -DeadCost;
	    }
	    return getGatherCost(VecTy);
	  }
	  case Instruction::ZExt:
	  case Instruction::SExt:
	  case Instruction::FPToUI:
	  case Instruction::FPToSI:
	  case Instruction::FPExt:
	  case Instruction::PtrToInt:
	  case Instruction::IntToPtr:
	  case Instruction::SIToFP:
	  case Instruction::UIToFP:
	  case Instruction::Trunc:
	  case Instruction::FPTrunc:
	  case Instruction::BitCast: {
	    Type *SrcTy = VL0->getOperand(0)->getType();

	    // Calculate the cost of this instruction.
	    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
	                                                       VL0->getType(), SrcTy);

	    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
	    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
	    return VecCost - ScalarCost;
	  }
	  case Instruction::FCmp:
	  case Instruction::ICmp:
	  case Instruction::Select: {
	    // Calculate the cost of this instruction.
	    VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
	    int ScalarCost = VecTy->getNumElements() *
	        TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
	    int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
	    return VecCost - ScalarCost;
	  }
	  case Instruction::Add:
	  case Instruction::FAdd:
	  case Instruction::Sub:
	  case Instruction::FSub:
	  case Instruction::Mul:
	  case Instruction::FMul:
	  case Instruction::UDiv:
	  case Instruction::SDiv:
	  case Instruction::FDiv:
	  case Instruction::URem:
	  case Instruction::SRem:
	  case Instruction::FRem:
	  case Instruction::Shl:
	  case Instruction::LShr:
	  case Instruction::AShr:
	  case Instruction::And:
	  case Instruction::Or:
	  case Instruction::Xor: {
	    // Certain instructions can be cheaper to vectorize if they have a
	    // constant second vector operand.
	    TargetTransformInfo::OperandValueKind Op1VK =
	        TargetTransformInfo::OK_AnyValue;
	    TargetTransformInfo::OperandValueKind Op2VK =
	        TargetTransformInfo::OK_UniformConstantValue;
	    TargetTransformInfo::OperandValueProperties Op1VP =
	        TargetTransformInfo::OP_None;
	    TargetTransformInfo::OperandValueProperties Op2VP =
	        TargetTransformInfo::OP_None;

	    // If all operands are exactly the same ConstantInt then set the
	    // operand kind to OK_UniformConstantValue.
	    // If instead not all operands are constants, then set the operand kind
	    // to OK_AnyValue. If all operands are constants but not the same,
	    // then set the operand kind to OK_NonUniformConstantValue.
	    ConstantInt *CInt = nullptr;
	    for (unsigned i = 0; i < VL.size(); ++i) {
	      const Instruction *I = cast<Instruction>(VL[i]);
	      if (!isa<ConstantInt>(I->getOperand(1))) {
	        Op2VK = TargetTransformInfo::OK_AnyValue;
	        break;
	      }
	      if (i == 0) {
	        CInt = cast<ConstantInt>(I->getOperand(1));
	        continue;
	      }
	      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
	          CInt != cast<ConstantInt>(I->getOperand(1)))
	        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
	    }
	    // FIXME: Currently cost of model modification for division by power of
	    // 2 is handled for X86 and AArch64. Add support for other targets.
	    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
	        CInt->getValue().isPowerOf2())
	      Op2VP = TargetTransformInfo::OP_PowerOf2;

	    int ScalarCost = VecTy->getNumElements() *
	                     TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
	                                                 Op2VK, Op1VP, Op2VP);
	    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
	                                              Op1VP, Op2VP);
	    return VecCost - ScalarCost;
	  }
	  case Instruction::GetElementPtr: {
	    // GEPs are costed as an add with a uniform constant second operand.
	    TargetTransformInfo::OperandValueKind Op1VK =
	        TargetTransformInfo::OK_AnyValue;
	    TargetTransformInfo::OperandValueKind Op2VK =
	        TargetTransformInfo::OK_UniformConstantValue;

	    int ScalarCost =
	        VecTy->getNumElements() *
	        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
	    int VecCost =
	        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);

	    return VecCost - ScalarCost;
	  }
	  case Instruction::Load: {
	    // Cost of wide load - cost of scalar loads.
	    // Opcode is Load, so VL0 is known to be a LoadInst.
	    unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
	    int ScalarLdCost = VecTy->getNumElements() *
	        TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
	    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
	                                         VecTy, alignment, 0);
	    return VecLdCost - ScalarLdCost;
	  }
	  case Instruction::Store: {
	    // We know that we can merge the stores. Calculate the cost.
	    // Opcode is Store, so VL0 is known to be a StoreInst.
	    unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
	    int ScalarStCost = VecTy->getNumElements() *
	        TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
	    int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
	                                         VecTy, alignment, 0);
	    return VecStCost - ScalarStCost;
	  }
	  case Instruction::Call: {
	    CallInst *CI = cast<CallInst>(VL0);
	    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

	    // Calculate the cost of the scalar and vector calls.
	    SmallVector<Type*, 4> ScalarTys, VecTys;
	    for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
	      ScalarTys.push_back(CI->getArgOperand(op)->getType());
	      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
	                                       VecTy->getNumElements()));
	    }

	    FastMathFlags FMF;
	    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
	      FMF = FPMO->getFastMathFlags();

	    int ScalarCallCost = VecTy->getNumElements() *
	        TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);

	    int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);

	    DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
	          << " (" << VecCallCost << "-" << ScalarCallCost << ")"
	          << " for " << *CI << "\n");

	    return VecCallCost - ScalarCallCost;
	  }
	  case Instruction::ShuffleVector: {
	    TargetTransformInfo::OperandValueKind Op1VK =
	        TargetTransformInfo::OK_AnyValue;
	    TargetTransformInfo::OperandValueKind Op2VK =
	        TargetTransformInfo::OK_AnyValue;
	    int ScalarCost = 0;
	    int VecCost = 0;
	    // cast<> never yields null, so no null check is needed on the result.
	    for (Value *i : VL) {
	      Instruction *I = cast<Instruction>(i);
	      ScalarCost +=
	          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
	    }
	    // VecCost is equal to sum of the cost of creating 2 vectors
	    // and the cost of creating shuffle.
	    Instruction *I0 = cast<Instruction>(VL[0]);
	    VecCost =
	        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
	    Instruction *I1 = cast<Instruction>(VL[1]);
	    VecCost +=
	        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
	    VecCost +=
	        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
	    return VecCost - ScalarCost;
	  }
	  default:
	    llvm_unreachable("Unknown instruction");
	  }
	}

	/// \returns true if the current tree has one or two entries and counts as
	/// fully vectorizable: a single non-gather entry; or two entries where the
	/// first is not gathered and the second is all-constant or a splat; or two
	/// entries neither of which is gathered.
	bool BoUpSLP::isFullyVectorizableTinyTree() {
	  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
	        VectorizableTree.size() << " is fully vectorizable .\n");

	  // We only handle trees of heights 1 and 2.
	  if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
	    return true;

	  if (VectorizableTree.size() != 2)
	    return false;

	  // Handle splat and all-constants stores.
	  if (!VectorizableTree[0].NeedToGather &&
	      (allConstant(VectorizableTree[1].Scalars) ||
	       isSplat(VectorizableTree[1].Scalars)))
	    return true;

	  // Gathering cost would be too much for tiny trees.
	  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
	    return false;

	  return true;
	}

	/// \returns true if the tree has fewer than MinTreeSize entries and
	/// isFullyVectorizableTinyTree() does not hold for it; false otherwise.
	bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {

	  // We can vectorize the tree if its size is greater than or equal to the
	  // minimum size specified by the MinTreeSize command line option.
	  if (VectorizableTree.size() >= MinTreeSize)
	    return false;

	  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
	  // can vectorize it if we can prove it fully vectorizable.
	  if (isFullyVectorizableTinyTree())
	    return false;

	  // An empty tree must not have recorded external uses. The message is
	  // attached to the whole condition rather than to one arm of a ternary.
	  assert((!VectorizableTree.empty() || ExternalUses.empty()) &&
	         "We shouldn't have any external users");

	  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
	  // vectorizable.
	  return true;
	}

	/// \returns the accumulated TTI cost of keeping the tree's live values alive
	/// across call instructions found between consecutive tree entries.
	int BoUpSLP::getSpillCost() {
	  // Walk from the bottom of the tree to the top, tracking which values are
	  // live. When we see a call instruction that is not part of our tree,
	  // query TTI to see if there is a cost to keeping values live over it
	  // (for example, if spills and fills are required).
	  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
	  int Cost = 0;

	  SmallPtrSet<Instruction*, 4> LiveValues;
	  Instruction *PrevInst = nullptr;

	  for (const auto &N : VectorizableTree) {
	    // Only the first scalar of each entry is considered.
	    Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
	    if (!Inst)
	      continue;

	    if (!PrevInst) {
	      PrevInst = Inst;
	      continue;
	    }

	    // Update LiveValues.
	    LiveValues.erase(PrevInst);
	    for (auto &J : PrevInst->operands()) {
	      // J is an operand slot; &*J is the Value it refers to.
	      if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
	        LiveValues.insert(cast<Instruction>(&*J));
	    }

	    DEBUG(
	      dbgs() << "SLP: #LV: " << LiveValues.size();
	      for (auto *X : LiveValues)
	        dbgs() << " " << X->getName();
	      dbgs() << ", Looking at ";
	      Inst->dump();
	      );

	    // Now find the sequence of instructions between PrevInst and Inst.
	    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
	                                 PrevInstIt =
	                                     PrevInst->getIterator().getReverse();
	    while (InstIt != PrevInstIt) {
	      // When the walk runs off the start of PrevInst's block, continue from
	      // the end of Inst's block.
	      if (PrevInstIt == PrevInst->getParent()->rend()) {
	        PrevInstIt = Inst->getParent()->rbegin();
	        continue;
	      }

	      if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
	        SmallVector<Type*, 4> V;
	        for (auto *II : LiveValues)
	          V.push_back(VectorType::get(II->getType(), BundleWidth));
	        Cost += TTI->getCostOfKeepingLiveOverCall(V);
	      }

	      ++PrevInstIt;
	    }

	    PrevInst = Inst;
	  }

	  return Cost;
	}

	int BoUpSLP::getTreeCost() {
	int Cost = 0;
	DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
	VectorizableTree.size() << ".\n");

	unsigned BundleWidth = VectorizableTree[0].Scalars.size();

	for (TreeEntry &TE : VectorizableTree) {
	int C = getEntryCost(&TE);
	DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
	<< *TE.Scalars[0] << ".\n");
	Cost += C;
	}

	SmallSet<Value *, 16> ExtractCostCalculated;
	int ExtractCost = 0;
	for (ExternalUser &EU : ExternalUses) {
	// We only add extract cost once for the same scalar.
	if (!ExtractCostCalculated.insert(EU.Scalar).second)
	continue;

	// Uses by ephemeral values are free (because the ephemeral value will be
	// removed prior to code generation, and so the extraction will be
	// removed as well).
	if (EphValues.count(EU.User))
	continue;

	// If we plan to rewrite the tree in a smaller type, we will need to sign
	// extend the extracted value back to the original type. Here, we account
	// for the extract and the added cost of the sign extend if needed.
	auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
	auto *ScalarRoot = VectorizableTree[0].Scalars[0];
	if (MinBWs.count(ScalarRoot)) {
	auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
	auto Extend =
	MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
	VecTy = VectorType::get(MinTy, BundleWidth);
	ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
	VecTy, EU.Lane);
	} else {
	ExtractCost +=
	TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
	}
	}

	int SpillCost = getSpillCost();
	Cost += SpillCost + ExtractCost;

	DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
	<< "SLP: Extract Cost = " << ExtractCost << ".\n"
	<< "SLP: Total Cost = " << Cost << ".\n");
	return Cost;
	}

	/// Returns the cost of building a vector of type \p Ty element by element:
	/// the sum of the InsertElement cost for every lane index of \p Ty.
	int BoUpSLP::getGatherCost(Type *Ty) {
	  const unsigned NumLanes = cast<VectorType>(Ty)->getNumElements();
	  int Total = 0;
	  for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
	    Total += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, Lane);
	  return Total;
	}

	/// Returns the gather cost for the bundle \p VL. The element type is taken
	/// from the first value of the bundle; for a store it is the type of the
	/// stored value. The vector has one lane per value in \p VL.
	int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
	  Value *First = VL[0];
	  auto *Store = dyn_cast<StoreInst>(First);
	  Type *ElemTy =
	      Store ? Store->getValueOperand()->getType() : First->getType();
	  VectorType *GatherTy = VectorType::get(ElemTy, VL.size());
	  // Find the cost of inserting/extracting values from the vector.
	  return getGatherCost(GatherTy);
	}

	// Reorder commutative operations in alternate shuffle if the resulting vectors
	// are consecutive loads. This would allow us to vectorize the tree.
	// If we have something like-
	// load a[0] - load b[0]
	// load b[1] + load a[1]
	// load a[2] - load b[2]
	// load a[3] + load b[3]
	// Reordering the second load b[1] load a[1] would allow us to vectorize this
	// code.
	//
	// The operands of every instruction in VL are appended to Left and Right
	// (operand 0 and operand 1 respectively) before any reordering takes place.
	void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
	                                        SmallVectorImpl<Value *> &Left,
	                                        SmallVectorImpl<Value *> &Right) {
	  // Push left and right operands of binary operation into Left and Right
	  for (Value *V : VL) {
	    auto *I = cast<Instruction>(V);
	    Left.push_back(I->getOperand(0));
	    Right.push_back(I->getOperand(1));
	  }

	  // If A (an operand of lane j) and B (an operand of lane j + 1) are
	  // consecutive loads sitting on opposite sides, commute lane j when it is
	  // commutative, otherwise lane j + 1 when that one is commutative.
	  // Returns true if a swap was performed.
	  auto TryCommute = [&](unsigned j, Value *A, Value *B) -> bool {
	    auto *LoadA = dyn_cast<LoadInst>(A);
	    if (!LoadA)
	      return false;
	    auto *LoadB = dyn_cast<LoadInst>(B);
	    if (!LoadB)
	      return false;

	    const bool CurCommutes = cast<Instruction>(VL[j])->isCommutative();
	    const bool NextCommutes = cast<Instruction>(VL[j + 1])->isCommutative();
	    // Only query the (comparatively expensive) consecutiveness check when a
	    // swap is possible at all, and only once per pair.
	    if (!CurCommutes && !NextCommutes)
	      return false;
	    if (!isConsecutiveAccess(LoadA, LoadB, *DL, *SE))
	      return false;

	    if (CurCommutes)
	      std::swap(Left[j], Right[j]);
	    else
	      std::swap(Left[j + 1], Right[j + 1]);
	    return true;
	  };

	  // Reorder if we have a commutative operation and consecutive access
	  // are on either side of the alternate instructions.
	  // The bound is written as `j + 1 < e` so that an empty VL does not make
	  // the unsigned expression `size() - 1` wrap around.
	  for (unsigned j = 0, e = VL.size(); j + 1 < e; ++j) {
	    if (TryCommute(j, Left[j], Right[j + 1]))
	      continue;
	    TryCommute(j, Right[j], Left[j + 1]);
	    // else unchanged
	  }
	}

	// Return true if I should be commuted before adding it's left and right
	// operands to the arrays Left and Right.
	//
	// The vectorizer is trying to either have all elements one side being
	// instruction with the same opcode to enable further vectorization, or having
	// a splat to lower the vectorizing cost.
	static bool shouldReorderOperands(int i, Instruction &I,
	SmallVectorImpl<Value *> &Left,
	SmallVectorImpl<Value *> &Right,
	bool AllSameOpcodeLeft,
	bool AllSameOpcodeRight, bool SplatLeft,
	bool SplatRight) {
	Value *VLeft = I.getOperand(0);
	Value *VRight = I.getOperand(1);
	// If we have "SplatRight", try to see if commuting is needed to preserve it.
	if (SplatRight) {
	if (VRight == Right[i - 1])
	// Preserve SplatRight
	return false;
	if (VLeft == Right[i - 1]) {
	// Commuting would preserve SplatRight, but we don't want to break
	// SplatLeft either, i.e. preserve the original order if possible.
	// (FIXME: why do we care?)
	if (SplatLeft && VLeft == Left[i - 1])
	return false;
	return true;
	}
	}
	// Symmetrically handle Right side.
	if (SplatLeft) {
	if (VLeft == Left[i - 1])
	// Preserve SplatLeft
	return false;
	if (VRight == Left[i - 1])
	return true;
	}

	Instruction *ILeft = dyn_cast<Instruction>(VLeft);
	Instruction *IRight = dyn_cast<Instruction>(VRight);

	// If we have "AllSameOpcodeRight", try to see if the left operands preserves
	// it and not the right, in this case we want to commute.
	if (AllSameOpcodeRight) {
	unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
	if (IRight && RightPrevOpcode == IRight->getOpcode())
	// Do not commute, a match on the right preserves AllSameOpcodeRight
	return false;
	if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
	// We have a match and may want to commute, but first check if there is
	// not also a match on the existing operands on the Left to preserve
	// AllSameOpcodeLeft, i.e. preserve the original order if possible.
	// (FIXME: why do we care?)
	if (AllSameOpcodeLeft && ILeft &&
	cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
	return false;
	return true;
	}
	}
	// Symmetrically handle Left side.
	if (AllSameOpcodeLeft) {
	unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
	if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
	return false;
	if (IRight && LeftPrevOpcode == IRight->getOpcode())
	return true;
	}
	return false;
	}

	/// Splits the operands of the commutative instructions in \p VL into
	/// \p Left and \p Right, commuting individual lanes so that one side ends up
	/// as a splat, or with instructions of a single opcode, or so that loads on
	/// neighbouring lanes become consecutive. Operands are appended to \p Left
	/// and \p Right; nothing is appended when \p VL is empty.
	void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
	                                             SmallVectorImpl<Value *> &Left,
	                                             SmallVectorImpl<Value *> &Right) {
	  // With an empty bundle there is nothing to push, and the code below would
	  // read Left[0] / Right[0] and iterate with a wrapped unsigned bound.
	  if (VL.empty())
	    return;

	  {
	    // Peel the first iteration out of the loop since there's nothing
	    // interesting to do anyway and it simplifies the checks in the loop.
	    auto *First = cast<Instruction>(VL[0]);
	    Value *VLeft = First->getOperand(0);
	    Value *VRight = First->getOperand(1);
	    if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
	      // Favor having instruction to the right. FIXME: why?
	      std::swap(VLeft, VRight);
	    Left.push_back(VLeft);
	    Right.push_back(VRight);
	  }

	  // Keep track if we have instructions with all the same opcode on one side.
	  bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
	  bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
	  // Keep track if we have one side with all the same value (broadcast).
	  bool SplatLeft = true;
	  bool SplatRight = true;

	  for (unsigned i = 1, e = VL.size(); i != e; ++i) {
	    Instruction *I = cast<Instruction>(VL[i]);
	    assert(I->isCommutative() && "Can only process commutative instruction");
	    // Commute to favor either a splat or maximizing having the same opcodes on
	    // one side.
	    if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
	                              AllSameOpcodeRight, SplatLeft, SplatRight)) {
	      Left.push_back(I->getOperand(1));
	      Right.push_back(I->getOperand(0));
	    } else {
	      Left.push_back(I->getOperand(0));
	      Right.push_back(I->getOperand(1));
	    }
	    // Update Splat* and AllSameOpcode* after the insertion.
	    SplatRight = SplatRight && (Right[i - 1] == Right[i]);
	    SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
	    AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
	                        (cast<Instruction>(Left[i - 1])->getOpcode() ==
	                         cast<Instruction>(Left[i])->getOpcode());
	    AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
	                         (cast<Instruction>(Right[i - 1])->getOpcode() ==
	                          cast<Instruction>(Right[i])->getOpcode());
	  }

	  // If one operand end up being broadcast, return this operand order.
	  if (SplatRight || SplatLeft)
	    return;

	  // True if A and B are both loads and B accesses the location right after A.
	  auto AreConsecutiveLoads = [&](Value *A, Value *B) -> bool {
	    auto *LoadA = dyn_cast<LoadInst>(A);
	    if (!LoadA)
	      return false;
	    auto *LoadB = dyn_cast<LoadInst>(B);
	    return LoadB && isConsecutiveAccess(LoadA, LoadB, *DL, *SE);
	  };

	  // Finally check if we can get longer vectorizable chain by reordering
	  // without breaking the good operand order detected above.
	  // E.g. If we have something like-
	  // load a[0] load b[0]
	  // load b[1] load a[1]
	  // load a[2] load b[2]
	  // load a[3] load b[3]
	  // Reordering the second load b[1] load a[1] would allow us to vectorize
	  // this code and we still retain AllSameOpcode property.
	  // FIXME: This load reordering might break AllSameOpcode in some rare cases
	  // such as-
	  // add a[0],c[0] load b[0]
	  // add a[1],c[2] load b[1]
	  // b[2] load b[2]
	  // add a[3],c[3] load b[3]
	  for (unsigned j = 0, e = VL.size(); j + 1 < e; ++j) {
	    // In both diagonal cases the fix is the same: commute lane j + 1 so the
	    // consecutive loads end up on the same side.
	    if (AreConsecutiveLoads(Left[j], Right[j + 1]) ||
	        AreConsecutiveLoads(Right[j], Left[j + 1]))
	      std::swap(Left[j + 1], Right[j + 1]);
	    // else unchanged
	  }
	}

	/// Positions Builder right after the last instruction (in program order) of
	/// the bundle \p VL and sets the current debug location to that of the
	/// bundle's first value. All values in \p VL must be instructions of the
	/// same basic block.
	void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {

	  // Get the basic block this bundle is in. All instructions in the bundle
	  // should be in this block.
	  auto *Front = cast<Instruction>(VL.front());
	  auto *BB = Front->getParent();
	  assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool {
	    return cast<Instruction>(V)->getParent() == BB;
	  }));

	  // The last instruction in the bundle in program order.
	  Instruction *LastInst = nullptr;

	  // Find the last instruction. The common case should be that BB has been
	  // scheduled, and the last instruction is VL.back(). So we start with
	  // VL.back() and iterate over schedule data until we reach the end of the
	  // bundle. The end of the bundle is marked by null ScheduleData.
	  auto SchedIt = BlocksSchedules.find(BB);
	  if (SchedIt != BlocksSchedules.end()) {
	    auto *Bundle = SchedIt->second->getScheduleData(VL.back());
	    if (Bundle && Bundle->isPartOfBundle())
	      for (; Bundle; Bundle = Bundle->NextInBundle)
	        LastInst = Bundle->Inst;
	  }

	  // LastInst can still be null at this point if there's either not an entry
	  // for BB in BlocksSchedules or there's no ScheduleData available for
	  // VL.back(). This can be the case if buildTree_rec aborts for various
	  // reasons (e.g., the maximum recursion depth is reached, the maximum region
	  // size is reached, etc.). ScheduleData is initialized in the scheduling
	  // "dry-run".
	  //
	  // If this happens, we can still find the last instruction by brute force. We
	  // iterate forwards from Front (inclusive) until we either see all
	  // instructions in the bundle or reach the end of the block. If Front is the
	  // last instruction in program order, LastInst will be set to Front, and we
	  // will visit all the remaining instructions in the block.
	  //
	  // One of the reasons we exit early from buildTree_rec is to place an upper
	  // bound on compile-time. Thus, taking an additional compile-time hit here is
	  // not ideal. However, this should be exceedingly rare since it requires that
	  // we both exit early from buildTree_rec and that the bundle be out-of-order
	  // (causing us to iterate all the way to the end of the block).
	  if (!LastInst) {
	    SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
	    for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
	      if (Bundle.erase(&I))
	        LastInst = &I;
	      if (Bundle.empty())
	        break;
	    }
	  }
	  // LastInst is dereferenced below, so make the invariant explicit.
	  assert(LastInst && "Failed to find last instruction in bundle");

	  // Set the insertion point after the last instruction in the bundle. Set the
	  // debug location to Front.
	  Builder.SetInsertPoint(BB, ++LastInst->getIterator());
	  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
	}

	/// Builds a vector of type \p Ty by inserting the values of \p VL one lane at
	/// a time into an undef vector, using the current insertion point of Builder.
	/// Every created instruction is recorded in GatherSeq and its block in
	/// CSEBlocks. When an inserted value belongs to a tree entry, the insert is
	/// registered in ExternalUses as a user of that scalar.
	/// \returns the resulting vector value.
	Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
	  Value *Vec = UndefValue::get(Ty);
	  // Generate the 'InsertElement' instruction.
	  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
	    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
	    // Only results that are instructions need bookkeeping.
	    auto *Insrt = dyn_cast<Instruction>(Vec);
	    if (!Insrt)
	      continue;

	    GatherSeq.insert(Insrt);
	    CSEBlocks.insert(Insrt->getParent());

	    // Add to our 'need-to-extract' list.
	    auto EntryIt = ScalarToTreeEntry.find(VL[i]);
	    if (EntryIt == ScalarToTreeEntry.end())
	      continue;

	    TreeEntry *E = &VectorizableTree[EntryIt->second];
	    // Find which lane we need to extract. The search walks E->Scalars, so it
	    // is bounded by that container's size rather than by VL's.
	    int FoundLane = -1;
	    for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
	      // Is this the lane of the scalar that we are looking for ?
	      if (E->Scalars[Lane] == VL[i]) {
	        FoundLane = Lane;
	        break;
	      }
	    }
	    assert(FoundLane >= 0 && "Could not find the correct lane");
	    ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
	  }

	  return Vec;
	}

	/// Returns the vectorized value of the tree entry that holds exactly the
	/// scalars \p VL, or nullptr if there is no such entry or it has not been
	/// vectorized yet. The entry is located through the first value of \p VL.
	Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
	  SmallDenseMap<Value*, int>::const_iterator Entry
	      = ScalarToTreeEntry.find(VL[0]);
	  if (Entry == ScalarToTreeEntry.end())
	    return nullptr;

	  const TreeEntry *En = &VectorizableTree[Entry->second];
	  if (En->isSame(VL) && En->VectorizedValue)
	    return En->VectorizedValue;
	  return nullptr;
	}

	/// Returns a vector value for the bundle \p VL. If the tree contains an entry
	/// with exactly these scalars, that entry is vectorized; otherwise the values
	/// are gathered into a new vector at the current insertion point.
	Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
	  // Single lookup instead of count() followed by operator[].
	  auto EntryIt = ScalarToTreeEntry.find(VL[0]);
	  if (EntryIt != ScalarToTreeEntry.end()) {
	    TreeEntry *E = &VectorizableTree[EntryIt->second];
	    if (E->isSame(VL))
	      return vectorizeTree(E);
	  }

	  // For stores the element type is the type of the stored value.
	  Type *ScalarTy = VL[0]->getType();
	  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
	    ScalarTy = SI->getValueOperand()->getType();
	  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

	  return Gather(VL, VecTy);
	}

	/// Emits the vector code for the tree entry \p E and returns the resulting
	/// value. The result is cached in E->VectorizedValue, so a second request for
	/// the same entry returns the cached value. Entries flagged NeedToGather are
	/// materialized with Gather(); all other entries are dispatched on the common
	/// opcode of their scalars, vectorizing operand bundles recursively.
	/// The builder's insertion point is restored on return.
	Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
	  IRBuilder<>::InsertPointGuard Guard(Builder);

	  if (E->VectorizedValue) {
	    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
	    return E->VectorizedValue;
	  }

	  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
	  // For stores the element type is the type of the stored value.
	  Type *ScalarTy = VL0->getType();
	  if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
	    ScalarTy = SI->getValueOperand()->getType();
	  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());

	  if (E->NeedToGather) {
	    setInsertPointAfterBundle(E->Scalars);
	    auto *V = Gather(E->Scalars, VecTy);
	    E->VectorizedValue = V;
	    return V;
	  }

	  unsigned Opcode = getSameOpcode(E->Scalars);

	  switch (Opcode) {
	    case Instruction::PHI: {
	      // The opcode guarantees a PHI, so use the asserting cast; the result is
	      // dereferenced unconditionally below.
	      PHINode *PH = cast<PHINode>(VL0);
	      Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
	      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
	      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
	      // Publish the new PHI before visiting operands so that recursive
	      // requests for this entry find it.
	      E->VectorizedValue = NewPhi;

	      // PHINodes may have multiple entries from the same block. We want to
	      // visit every block once.
	      SmallSet<BasicBlock*, 4> VisitedBBs;

	      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
	        ValueList Operands;
	        BasicBlock *IBB = PH->getIncomingBlock(i);

	        if (!VisitedBBs.insert(IBB).second) {
	          NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
	          continue;
	        }

	        // Prepare the operand vector.
	        for (Value *V : E->Scalars)
	          Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));

	        Builder.SetInsertPoint(IBB->getTerminator());
	        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
	        Value *Vec = vectorizeTree(Operands);
	        NewPhi->addIncoming(Vec, IBB);
	      }

	      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
	             "Invalid number of incoming values");
	      return NewPhi;
	    }

	    case Instruction::ExtractElement: {
	      // Reuse the source vector of the extracts when possible.
	      if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) {
	        Value *V = VL0->getOperand(0);
	        E->VectorizedValue = V;
	        return V;
	      }
	      setInsertPointAfterBundle(E->Scalars);
	      auto *V = Gather(E->Scalars, VecTy);
	      E->VectorizedValue = V;
	      return V;
	    }
	    case Instruction::ExtractValue: {
	      // When the extracts can be reused, reload the aggregate as a vector
	      // through a bitcast of the original load's pointer.
	      if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) {
	        LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
	        Builder.SetInsertPoint(LI);
	        PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
	        Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
	        LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
	        E->VectorizedValue = V;
	        return propagateMetadata(V, E->Scalars);
	      }
	      setInsertPointAfterBundle(E->Scalars);
	      auto *V = Gather(E->Scalars, VecTy);
	      E->VectorizedValue = V;
	      return V;
	    }
	    case Instruction::ZExt:
	    case Instruction::SExt:
	    case Instruction::FPToUI:
	    case Instruction::FPToSI:
	    case Instruction::FPExt:
	    case Instruction::PtrToInt:
	    case Instruction::IntToPtr:
	    case Instruction::SIToFP:
	    case Instruction::UIToFP:
	    case Instruction::Trunc:
	    case Instruction::FPTrunc:
	    case Instruction::BitCast: {
	      ValueList INVL;
	      for (Value *V : E->Scalars)
	        INVL.push_back(cast<Instruction>(V)->getOperand(0));

	      setInsertPointAfterBundle(E->Scalars);

	      Value *InVec = vectorizeTree(INVL);

	      // Vectorizing the operands may already have produced this entry.
	      if (Value *V = alreadyVectorized(E->Scalars))
	        return V;

	      // The opcode guarantees a cast instruction; the result is dereferenced
	      // unconditionally.
	      CastInst *CI = cast<CastInst>(VL0);
	      Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
	      E->VectorizedValue = V;
	      ++NumVectorInstructions;
	      return V;
	    }
	    case Instruction::FCmp:
	    case Instruction::ICmp: {
	      ValueList LHSV, RHSV;
	      for (Value *V : E->Scalars) {
	        LHSV.push_back(cast<Instruction>(V)->getOperand(0));
	        RHSV.push_back(cast<Instruction>(V)->getOperand(1));
	      }

	      setInsertPointAfterBundle(E->Scalars);

	      Value *L = vectorizeTree(LHSV);
	      Value *R = vectorizeTree(RHSV);

	      if (Value *V = alreadyVectorized(E->Scalars))
	        return V;

	      // The predicate of the first scalar is used for the vector compare.
	      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
	      Value *V;
	      if (Opcode == Instruction::FCmp)
	        V = Builder.CreateFCmp(P0, L, R);
	      else
	        V = Builder.CreateICmp(P0, L, R);

	      E->VectorizedValue = V;
	      propagateIRFlags(E->VectorizedValue, E->Scalars);
	      ++NumVectorInstructions;
	      return V;
	    }
	    case Instruction::Select: {
	      ValueList TrueVec, FalseVec, CondVec;
	      for (Value *V : E->Scalars) {
	        CondVec.push_back(cast<Instruction>(V)->getOperand(0));
	        TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
	        FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
	      }

	      setInsertPointAfterBundle(E->Scalars);

	      Value *Cond = vectorizeTree(CondVec);
	      Value *True = vectorizeTree(TrueVec);
	      Value *False = vectorizeTree(FalseVec);

	      if (Value *V = alreadyVectorized(E->Scalars))
	        return V;

	      Value *V = Builder.CreateSelect(Cond, True, False);
	      E->VectorizedValue = V;
	      ++NumVectorInstructions;
	      return V;
	    }
	    case Instruction::Add:
	    case Instruction::FAdd:
	    case Instruction::Sub:
	    case Instruction::FSub:
	    case Instruction::Mul:
	    case Instruction::FMul:
	    case Instruction::UDiv:
	    case Instruction::SDiv:
	    case Instruction::FDiv:
	    case Instruction::URem:
	    case Instruction::SRem:
	    case Instruction::FRem:
	    case Instruction::Shl:
	    case Instruction::LShr:
	    case Instruction::AShr:
	    case Instruction::And:
	    case Instruction::Or:
	    case Instruction::Xor: {
	      ValueList LHSVL, RHSVL;
	      // Commutative operations may have their lanes commuted to obtain
	      // better operand bundles.
	      if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
	        reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
	      else
	        for (Value *V : E->Scalars) {
	          LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
	          RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
	        }

	      setInsertPointAfterBundle(E->Scalars);

	      Value *LHS = vectorizeTree(LHSVL);
	      Value *RHS = vectorizeTree(RHSVL);

	      if (Value *V = alreadyVectorized(E->Scalars))
	        return V;

	      BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
	      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
	      E->VectorizedValue = V;
	      propagateIRFlags(E->VectorizedValue, E->Scalars);
	      ++NumVectorInstructions;

	      if (Instruction *I = dyn_cast<Instruction>(V))
	        return propagateMetadata(I, E->Scalars);

	      return V;
	    }
	    case Instruction::Load: {
	      // Loads are inserted at the head of the tree because we don't want to
	      // sink them all the way down past store instructions.
	      setInsertPointAfterBundle(E->Scalars);

	      LoadInst *LI = cast<LoadInst>(VL0);
	      Type *ScalarLoadTy = LI->getType();
	      unsigned AS = LI->getPointerAddressSpace();

	      Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
	                                            VecTy->getPointerTo(AS));

	      // The pointer operand uses an in-tree scalar so we add the new BitCast to
	      // ExternalUses list to make sure that an extract will be generated in the
	      // future.
	      if (ScalarToTreeEntry.count(LI->getPointerOperand()))
	        ExternalUses.push_back(
	            ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));

	      // An alignment of 0 on the scalar load is replaced by the ABI alignment
	      // of the scalar type.
	      unsigned Alignment = LI->getAlignment();
	      LI = Builder.CreateLoad(VecPtr);
	      if (!Alignment) {
	        Alignment = DL->getABITypeAlignment(ScalarLoadTy);
	      }
	      LI->setAlignment(Alignment);
	      E->VectorizedValue = LI;
	      ++NumVectorInstructions;
	      return propagateMetadata(LI, E->Scalars);
	    }
	    case Instruction::Store: {
	      StoreInst *SI = cast<StoreInst>(VL0);
	      unsigned Alignment = SI->getAlignment();
	      unsigned AS = SI->getPointerAddressSpace();

	      ValueList ValueOp;
	      for (Value *V : E->Scalars)
	        ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());

	      setInsertPointAfterBundle(E->Scalars);

	      Value *VecValue = vectorizeTree(ValueOp);
	      Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
	                                            VecTy->getPointerTo(AS));
	      StoreInst *S = Builder.CreateStore(VecValue, VecPtr);

	      // The pointer operand uses an in-tree scalar so we add the new BitCast to
	      // ExternalUses list to make sure that an extract will be generated in the
	      // future.
	      if (ScalarToTreeEntry.count(SI->getPointerOperand()))
	        ExternalUses.push_back(
	            ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));

	      // An alignment of 0 on the scalar store is replaced by the ABI
	      // alignment of the stored scalar type.
	      if (!Alignment) {
	        Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
	      }
	      S->setAlignment(Alignment);
	      E->VectorizedValue = S;
	      ++NumVectorInstructions;
	      return propagateMetadata(S, E->Scalars);
	    }
	    case Instruction::GetElementPtr: {
	      setInsertPointAfterBundle(E->Scalars);

	      // Vectorize the base pointers first, then every index operand.
	      ValueList Op0VL;
	      for (Value *V : E->Scalars)
	        Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));

	      Value *Op0 = vectorizeTree(Op0VL);

	      std::vector<Value *> OpVecs;
	      for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
	           ++j) {
	        ValueList OpVL;
	        for (Value *V : E->Scalars)
	          OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));

	        Value *OpVec = vectorizeTree(OpVL);
	        OpVecs.push_back(OpVec);
	      }

	      Value *V = Builder.CreateGEP(
	          cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
	      E->VectorizedValue = V;
	      ++NumVectorInstructions;

	      if (Instruction *I = dyn_cast<Instruction>(V))
	        return propagateMetadata(I, E->Scalars);

	      return V;
	    }
	    case Instruction::Call: {
	      CallInst *CI = cast<CallInst>(VL0);
	      setInsertPointAfterBundle(E->Scalars);
	      Intrinsic::ID IID = Intrinsic::not_intrinsic;
	      Value *ScalarArg = nullptr;
	      // CI comes from cast<> and cannot be null; only the callee may be
	      // unavailable.
	      if (Function *FI = CI->getCalledFunction())
	        IID = FI->getIntrinsicID();
	      std::vector<Value *> OpVecs;
	      for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
	        ValueList OpVL;
	        // ctlz,cttz and powi are special intrinsics whose second argument is
	        // a scalar. This argument should not be vectorized.
	        if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
	          CallInst *CEI = cast<CallInst>(E->Scalars[0]);
	          ScalarArg = CEI->getArgOperand(j);
	          OpVecs.push_back(CEI->getArgOperand(j));
	          continue;
	        }
	        for (Value *V : E->Scalars) {
	          CallInst *CEI = cast<CallInst>(V);
	          OpVL.push_back(CEI->getArgOperand(j));
	        }

	        Value *OpVec = vectorizeTree(OpVL);
	        DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
	        OpVecs.push_back(OpVec);
	      }

	      Module *M = F->getParent();
	      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
	      Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
	      Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
	      SmallVector<OperandBundleDef, 1> OpBundles;
	      CI->getOperandBundlesAsDefs(OpBundles);
	      Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

	      // The scalar argument uses an in-tree scalar so we add the new vectorized
	      // call to ExternalUses list to make sure that an extract will be
	      // generated in the future.
	      if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
	        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));

	      E->VectorizedValue = V;
	      propagateIRFlags(E->VectorizedValue, E->Scalars);
	      ++NumVectorInstructions;
	      return V;
	    }
	    case Instruction::ShuffleVector: {
	      ValueList LHSVL, RHSVL;
	      assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
	      reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
	      setInsertPointAfterBundle(E->Scalars);

	      Value *LHS = vectorizeTree(LHSVL);
	      Value *RHS = vectorizeTree(RHSVL);

	      if (Value *V = alreadyVectorized(E->Scalars))
	        return V;

	      // Create a vector of LHS op1 RHS
	      BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
	      Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);

	      // Create a vector of LHS op2 RHS
	      Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
	      BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
	      Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);

	      // Create shuffle to take alternate operations from the vector.
	      // Also, gather up odd and even scalar ops to propagate IR flags to
	      // each vector operation.
	      ValueList OddScalars, EvenScalars;
	      unsigned e = E->Scalars.size();
	      SmallVector<Constant *, 8> Mask(e);
	      for (unsigned i = 0; i < e; ++i) {
	        if (i & 1) {
	          // Odd lanes come from V1 (mask indices e .. 2e-1).
	          Mask[i] = Builder.getInt32(e + i);
	          OddScalars.push_back(E->Scalars[i]);
	        } else {
	          // Even lanes come from V0.
	          Mask[i] = Builder.getInt32(i);
	          EvenScalars.push_back(E->Scalars[i]);
	        }
	      }

	      Value *ShuffleMask = ConstantVector::get(Mask);
	      propagateIRFlags(V0, EvenScalars);
	      propagateIRFlags(V1, OddScalars);

	      Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
	      E->VectorizedValue = V;
	      ++NumVectorInstructions;
	      if (Instruction *I = dyn_cast<Instruction>(V))
	        return propagateMetadata(I, E->Scalars);

	      return V;
	    }
	    default:
	      llvm_unreachable("unknown inst");
	  }
	  return nullptr;
	}

	/// Vectorizes the whole tree: schedules all blocks, emits the vector code
	/// starting from the root entry, generates extracts for all external users
	/// of vectorized scalars, and finally erases the vectorized scalar
	/// instructions. \returns the vectorized value of the root entry.
	Value *BoUpSLP::vectorizeTree() {

	  // All blocks must be scheduled before any instructions are inserted.
	  for (auto &BSIter : BlocksSchedules) {
	    scheduleBlock(BSIter.second.get());
	  }

	  Builder.SetInsertPoint(&F->getEntryBlock().front());
	  auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);

	  // If the vectorized tree can be rewritten in a smaller type, we truncate the
	  // vectorized root. InstCombine will then rewrite the entire expression. We
	  // sign extend the extracted values below.
	  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
	  if (MinBWs.count(ScalarRoot)) {
	    if (auto *I = dyn_cast<Instruction>(VectorRoot))
	      Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
	    auto BundleWidth = VectorizableTree[0].Scalars.size();
	    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
	    auto *VecTy = VectorType::get(MinTy, BundleWidth);
	    auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
	    VectorizableTree[0].VectorizedValue = Trunc;
	  }

	  DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");

	  // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
	  // specified by ScalarType.
	  auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
	    if (!MinBWs.count(ScalarRoot))
	      return Ex;
	    if (MinBWs[ScalarRoot].second)
	      return Builder.CreateSExt(Ex, ScalarType);
	    return Builder.CreateZExt(Ex, ScalarType);
	  };

	  // Extract all of the elements with the external uses.
	  for (const auto &ExternalUse : ExternalUses) {
	    Value *Scalar = ExternalUse.Scalar;
	    llvm::User *User = ExternalUse.User;

	    // Skip users that we already RAUW. This happens when one instruction
	    // has multiple uses of the same value.
	    if (!is_contained(Scalar->users(), User))
	      continue;
	    assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");

	    int Idx = ScalarToTreeEntry[Scalar];
	    TreeEntry *E = &VectorizableTree[Idx];
	    assert(!E->NeedToGather && "Extracting from a gather list");

	    Value *Vec = E->VectorizedValue;
	    assert(Vec && "Can't find vectorizable value");

	    Value *Lane = Builder.getInt32(ExternalUse.Lane);
	    // Generate extracts for out-of-tree users.
	    // Find the insertion point for the extractelement lane.
	    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
	      if (PHINode *PH = dyn_cast<PHINode>(User)) {
	        // For a PHI user the extract is emitted per incoming edge that
	        // carries the scalar.
	        for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
	          if (PH->getIncomingValue(i) == Scalar) {
	            TerminatorInst *IncomingTerminator =
	                PH->getIncomingBlock(i)->getTerminator();
	            // With a catchswitch terminator the extract is placed right
	            // after the vector instruction instead of before the terminator.
	            if (isa<CatchSwitchInst>(IncomingTerminator)) {
	              Builder.SetInsertPoint(VecI->getParent(),
	                                     std::next(VecI->getIterator()));
	            } else {
	              Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
	            }
	            Value *Ex = Builder.CreateExtractElement(Vec, Lane);
	            Ex = extend(ScalarRoot, Ex, Scalar->getType());
	            CSEBlocks.insert(PH->getIncomingBlock(i));
	            PH->setOperand(i, Ex);
	          }
	        }
	      } else {
	        Builder.SetInsertPoint(cast<Instruction>(User));
	        Value *Ex = Builder.CreateExtractElement(Vec, Lane);
	        Ex = extend(ScalarRoot, Ex, Scalar->getType());
	        CSEBlocks.insert(cast<Instruction>(User)->getParent());
	        User->replaceUsesOfWith(Scalar, Ex);
	      }
	    } else {
	      // The vectorized value is not an instruction: emit the extract at the
	      // start of the entry block.
	      Builder.SetInsertPoint(&F->getEntryBlock().front());
	      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
	      Ex = extend(ScalarRoot, Ex, Scalar->getType());
	      CSEBlocks.insert(&F->getEntryBlock());
	      User->replaceUsesOfWith(Scalar, Ex);
	    }

	    DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
	  }

	  // For each vectorized value:
	  for (TreeEntry &EIdx : VectorizableTree) {
	    TreeEntry *Entry = &EIdx;

	    // For each lane:
	    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
	      Value *Scalar = Entry->Scalars[Lane];
	      // No need to handle users of gathered values.
	      if (Entry->NeedToGather)
	        continue;

	      assert(Entry->VectorizedValue && "Can't find vectorizable value");

	      Type *Ty = Scalar->getType();
	      if (!Ty->isVoidTy()) {
	#ifndef NDEBUG
	        for (User *U : Scalar->users()) {
	          DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");

	          assert((ScalarToTreeEntry.count(U) ||
	                  // It is legal to replace users in the ignorelist by undef.
	                  is_contained(UserIgnoreList, U)) &&
	                 "Replacing out-of-tree value with undef");
	        }
	#endif
	        // Remaining uses are replaced by undef before the scalar is erased.
	        Value *Undef = UndefValue::get(Ty);
	        Scalar->replaceAllUsesWith(Undef);
	      }
	      DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
	      eraseInstruction(cast<Instruction>(Scalar));
	    }
	  }

	  Builder.ClearInsertionPoint();

	  return VectorizableTree[0].VectorizedValue;
	}

	void BoUpSLP::optimizeGatherSequence() {
	DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
	<< " gather sequences instructions.\n");
	// LICM InsertElementInst sequences.
	for (Instruction *it : GatherSeq) {
	InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);

	if (!Insert)
	continue;

	// Check if this block is inside a loop.
	Loop *L = LI->getLoopFor(Insert->getParent());
	if (!L)
	continue;

	// Check if it has a preheader.
	BasicBlock *PreHeader = L->getLoopPreheader();
	if (!PreHeader)
	continue;

	// If the vector or the element that we insert into it are
	// instructions that are defined in this basic block then we can't
	// hoist this instruction.
	Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
	Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
	if (CurrVec && L->contains(CurrVec))
	continue;
	if (NewElem && L->contains(NewElem))
	continue;

	// We can hoist this instruction. Move it to the pre-header.
	Insert->moveBefore(PreHeader->getTerminator());
	}

	// Make a list of all reachable blocks in our CSE queue.
	SmallVector<const DomTreeNode *, 8> CSEWorkList;
	CSEWorkList.reserve(CSEBlocks.size());
	for (BasicBlock *BB : CSEBlocks)
	if (DomTreeNode *N = DT->getNode(BB)) {
	assert(DT->isReachableFromEntry(N));
	CSEWorkList.push_back(N);
	}

	// Sort blocks by domination. This ensures we visit a block after all blocks
	// dominating it are visited.
	std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
	[this](const DomTreeNode A, const DomTreeNode B) {
	return DT->properlyDominates(A, B);
	});

	// Perform O(N^2) search over the gather sequences and merge identical
	// instructions. TODO: We can further optimize this scan if we split the
	// instructions into different buckets based on the insert lane.
	SmallVector<Instruction *, 16> Visited;
	for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
	assert((I == CSEWorkList.begin() \|\| !DT->dominates(I, std::prev(I))) &&
	"Worklist not sorted properly!");
	BasicBlock BB = (I)->getBlock();
	// For all instructions in blocks containing gather sequences:
	for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
	Instruction In = &it++;
	if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
	continue;

	// Check if we can replace this instruction with any of the
	// visited instructions.
	for (Instruction *v : Visited) {
	if (In->isIdenticalTo(v) &&
	DT->dominates(v->getParent(), In->getParent())) {
	In->replaceAllUsesWith(v);
	eraseInstruction(In);
	In = nullptr;
	break;
	}
	}
	if (In) {
	assert(!is_contained(Visited, In));
	Visited.push_back(In);
	}
	}
	}
	CSEBlocks.clear();
	GatherSeq.clear();
	}

	// Groups the instructions to a bundle (which is then a single scheduling entity)
	// and schedules instructions until the bundle gets ready.
	//
	// VL holds the bundle members; SLP is only forwarded to
	// calculateDependencies(). Returns true when the bundle became ready (or
	// immediately when VL[0] is a PHI node, in which case nothing is done).
	// Returns false when the scheduling region could not be extended to cover a
	// member, or when the bundle never became ready; in the latter case the
	// bundle is dissolved again through cancelScheduling().
	bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
	BoUpSLP *SLP) {
	if (isa<PHINode>(VL[0]))
	return true;

	// Initialize the instruction bundle.
	// OldScheduleEnd is remembered so that growth of the region at its lower end
	// can be detected after the members have been added.
	Instruction *OldScheduleEnd = ScheduleEnd;
	ScheduleData *PrevInBundle = nullptr;
	ScheduleData *Bundle = nullptr;
	bool ReSchedule = false;
	DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");

	// Make sure that the scheduling region contains all
	// instructions of the bundle.
	for (Value *V : VL) {
	if (!extendSchedulingRegion(V))
	return false;
	}

	// Chain the members through NextInBundle, point each member's FirstInBundle
	// at the first member, and accumulate the members' UnscheduledDeps in the
	// first member's UnscheduledDepsInBundle.
	for (Value *V : VL) {
	ScheduleData *BundleMember = getScheduleData(V);
	assert(BundleMember &&
	"no ScheduleData for bundle member (maybe not in same basic block)");
	if (BundleMember->IsScheduled) {
	// A bundle member was scheduled as single instruction before and now
	// needs to be scheduled as part of the bundle. We just get rid of the
	// existing schedule.
	DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
	<< " was already scheduled\n");
	ReSchedule = true;
	}
	assert(BundleMember->isSchedulingEntity() &&
	"bundle member already part of other bundle");
	if (PrevInBundle) {
	PrevInBundle->NextInBundle = BundleMember;
	} else {
	Bundle = BundleMember;
	}
	BundleMember->UnscheduledDepsInBundle = 0;
	Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;

	// Group the instructions to a bundle.
	BundleMember->FirstInBundle = Bundle;
	PrevInBundle = BundleMember;
	}
	if (ScheduleEnd != OldScheduleEnd) {
	// The scheduling region got new instructions at the lower end (or it is a
	// new region for the first bundle). This makes it necessary to
	// recalculate all dependencies.
	// It is seldom that this needs to be done a second time after adding the
	// initial bundle to the region.
	for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
	ScheduleData *SD = getScheduleData(I);
	SD->clearDependencies();
	}
	ReSchedule = true;
	}
	if (ReSchedule) {
	// Start over: drop the scheduled state of the region and refill the ready
	// list.
	resetSchedule();
	initialFillReadyList(ReadyInsts);
	}

	DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
	<< BB->getName() << "\n");

	calculateDependencies(Bundle, true, SLP);

	// Now try to schedule the new bundle. As soon as the bundle is "ready" it
	// means that there are no cyclic dependencies and we can schedule it.
	// Note that's important that we don't "schedule" the bundle yet (see
	// cancelScheduling).
	while (!Bundle->isReady() && !ReadyInsts.empty()) {

	ScheduleData *pickedSD = ReadyInsts.back();
	ReadyInsts.pop_back();

	// Entries that are no longer a ready scheduling entity are simply dropped.
	if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
	schedule(pickedSD, ReadyInsts);
	}
	}
	// The ready list ran dry before the bundle became ready: give up and undo
	// the bundling.
	if (!Bundle->isReady()) {
	cancelScheduling(VL);
	return false;
	}
	return true;
	}

	// Dissolves the bundle headed by VL[0] back into single-instruction
	// scheduling entities. Nothing is done when VL[0] is a PHI node. Members
	// that have no unscheduled dependencies left are put into ReadyInsts.
	void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
	  if (isa<PHINode>(VL[0]))
	    return;

	  ScheduleData *Bundle = getScheduleData(VL[0]);
	  DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
	  assert(!Bundle->IsScheduled &&
	         "Can't cancel bundle which is already scheduled");
	  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
	         "tried to unbundle something which is not a bundle");

	  // Walk the bundle chain; the successor is read before the link is cut.
	  ScheduleData *Successor = nullptr;
	  for (ScheduleData *Member = Bundle; Member; Member = Successor) {
	    assert(Member->FirstInBundle == Bundle && "corrupt bundle links");
	    Successor = Member->NextInBundle;

	    // Each member becomes its own one-element scheduling entity again.
	    Member->FirstInBundle = Member;
	    Member->NextInBundle = nullptr;
	    Member->UnscheduledDepsInBundle = Member->UnscheduledDeps;
	    if (Member->UnscheduledDepsInBundle == 0)
	      ReadyInsts.insert(Member);
	  }
	}

	// Makes sure the scheduling region [ScheduleStart, ScheduleEnd) contains V.
	//
	// Returns true right away if getScheduleData(V) already yields an entry.
	// Otherwise V must be a non-PHI instruction: it either opens a new region
	// (when ScheduleStart is null) or the region is grown upwards or downwards
	// until V is reached, initializing schedule data for every instruction that
	// gets included. Returns false if ScheduleRegionSize would exceed
	// ScheduleRegionSizeLimit.
	bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
	  if (getScheduleData(V))
	    return true;
	  Instruction *I = dyn_cast<Instruction>(V);
	  assert(I && "bundle member must be an instruction");
	  assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
	  if (!ScheduleStart) {
	    // It's the first instruction in the new region.
	    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
	    ScheduleStart = I;
	    ScheduleEnd = I->getNextNode();
	    assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
	    DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
	    return true;
	  }
	  // Search up and down at the same time, because we don't know if the new
	  // instruction is above or below the existing scheduling region.
	  BasicBlock::reverse_iterator UpIter =
	      ++ScheduleStart->getIterator().getReverse();
	  BasicBlock::reverse_iterator UpperEnd = BB->rend();
	  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
	  BasicBlock::iterator LowerEnd = BB->end();
	  for (;;) {
	    // Every search step counts against the region size budget.
	    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
	      DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
	      return false;
	    }

	    if (UpIter != UpperEnd) {
	      if (&*UpIter == I) {
	        // Found above the region: initialize [I, ScheduleStart) and link its
	        // memory instructions in front of FirstLoadStoreInRegion.
	        initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
	        ScheduleStart = I;
	        DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
	        return true;
	      }
	      UpIter++;
	    }
	    if (DownIter != LowerEnd) {
	      if (&*DownIter == I) {
	        // Found below the region: initialize [ScheduleEnd, I->getNextNode())
	        // and link its memory instructions after LastLoadStoreInRegion.
	        initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
	                         nullptr);
	        ScheduleEnd = I->getNextNode();
	        assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
	        DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
	        return true;
	      }
	      DownIter++;
	    }
	    assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
	           "instruction not found in block");
	  }
	  // Not reached: the loop above only exits through a return statement.
	  return true;
	}

	// Initializes the ScheduleData of every instruction in [FromI, ToI) for the
	// current scheduling region (SchedulingRegionID), allocating entries from
	// ScheduleDataChunks for instructions that have none in ScheduleDataMap yet.
	//
	// Instructions that may read or write memory are threaded into the
	// NextLoadStore list: the chain starts after PrevLoadStore (or becomes
	// FirstLoadStoreInRegion when PrevLoadStore is null) and is connected to
	// NextLoadStore at the end (or recorded as LastLoadStoreInRegion when
	// NextLoadStore is null).
	void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
	Instruction *ToI,
	ScheduleData *PrevLoadStore,
	ScheduleData *NextLoadStore) {
	ScheduleData *CurrentLoadStore = PrevLoadStore;
	for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
	ScheduleData *SD = ScheduleDataMap[I];
	if (!SD) {
	// Allocate a new ScheduleData for the instruction.
	// Entries are handed out from fixed-size chunks; a new chunk is started
	// once the current one is used up.
	if (ChunkPos >= ChunkSize) {
	ScheduleDataChunks.push_back(
	llvm::make_unique<ScheduleData[]>(ChunkSize));
	ChunkPos = 0;
	}
	SD = &(ScheduleDataChunks.back()[ChunkPos++]);
	ScheduleDataMap[I] = SD;
	SD->Inst = I;
	}
	assert(!isInSchedulingRegion(SD) &&
	"new ScheduleData already in scheduling region");
	SD->init(SchedulingRegionID);

	if (I->mayReadOrWriteMemory()) {
	// Update the linked list of memory accessing instructions.
	if (CurrentLoadStore) {
	CurrentLoadStore->NextLoadStore = SD;
	} else {
	FirstLoadStoreInRegion = SD;
	}
	CurrentLoadStore = SD;
	}
	}
	// Connect the tail of the chain built above to the memory instructions that
	// follow it, or remember it as the last one in the region.
	if (NextLoadStore) {
	if (CurrentLoadStore)
	CurrentLoadStore->NextLoadStore = NextLoadStore;
	} else {
	LastLoadStoreInRegion = CurrentLoadStore;
	}
	}

	// (Re)computes the dependency counters of the scheduling entity SD and,
	// through a worklist, of every bundle reached from it whose dependencies are
	// not valid yet.
	//
	// For each bundle member two kinds of dependencies are counted: def-use
	// dependencies on users inside the scheduling region, and memory
	// dependencies on later memory-accessing instructions of the region (using
	// SLP->isAliased, bounded by AliasedCheckLimit and MaxMemDepDistance).
	// When InsertInReadyList is true, every processed entity that is ready
	// afterwards is appended to ReadyInsts.
	void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
	                                                     bool InsertInReadyList,
	                                                     BoUpSLP *SLP) {
	  assert(SD->isSchedulingEntity());

	  SmallVector<ScheduleData *, 10> WorkList;
	  WorkList.push_back(SD);

	  while (!WorkList.empty()) {
	    // Note: this intentionally shadows the parameter; from here on SD is the
	    // entity currently taken from the worklist.
	    ScheduleData *SD = WorkList.back();
	    WorkList.pop_back();

	    ScheduleData *BundleMember = SD;
	    while (BundleMember) {
	      assert(isInSchedulingRegion(BundleMember));
	      if (!BundleMember->hasValidDependencies()) {

	        DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
	        BundleMember->Dependencies = 0;
	        BundleMember->resetUnscheduledDeps();

	        // Handle def-use chain dependencies.
	        for (User *U : BundleMember->Inst->users()) {
	          if (isa<Instruction>(U)) {
	            ScheduleData *UseSD = getScheduleData(U);
	            if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
	              BundleMember->Dependencies++;
	              ScheduleData *DestBundle = UseSD->FirstInBundle;
	              if (!DestBundle->IsScheduled) {
	                BundleMember->incrementUnscheduledDeps(1);
	              }
	              if (!DestBundle->hasValidDependencies()) {
	                WorkList.push_back(DestBundle);
	              }
	            }
	          } else {
	            // I'm not sure if this can ever happen. But we need to be safe.
	            // This lets the instruction/bundle never be scheduled and
	            // eventually disable vectorization.
	            BundleMember->Dependencies++;
	            BundleMember->incrementUnscheduledDeps(1);
	          }
	        }

	        // Handle the memory dependencies.
	        ScheduleData *DepDest = BundleMember->NextLoadStore;
	        if (DepDest) {
	          Instruction *SrcInst = BundleMember->Inst;
	          MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
	          bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
	          unsigned numAliased = 0;
	          unsigned DistToSrc = 1;

	          while (DepDest) {
	            assert(isInSchedulingRegion(DepDest));

	            // We have two limits to reduce the complexity:
	            // 1) AliasedCheckLimit: It's a small limit to reduce calls to
	            //    SLP->isAliased (which is the expensive part in this loop).
	            // 2) MaxMemDepDistance: It's for very large blocks and it aborts
	            //    the whole loop (even if the loop is fast, it's quadratic).
	            //    It's important for the loop break condition (see below) to
	            //    check this limit even between two read-only instructions.
	            if (DistToSrc >= MaxMemDepDistance ||
	                ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
	                 (numAliased >= AliasedCheckLimit ||
	                  SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {

	              // We increment the counter only if the locations are aliased
	              // (instead of counting all alias checks). This gives a better
	              // balance between reduced runtime and accurate dependencies.
	              numAliased++;

	              DepDest->MemoryDependencies.push_back(BundleMember);
	              BundleMember->Dependencies++;
	              ScheduleData *DestBundle = DepDest->FirstInBundle;
	              if (!DestBundle->IsScheduled) {
	                BundleMember->incrementUnscheduledDeps(1);
	              }
	              if (!DestBundle->hasValidDependencies()) {
	                WorkList.push_back(DestBundle);
	              }
	            }
	            DepDest = DepDest->NextLoadStore;

	            // Example, explaining the loop break condition: Let's assume our
	            // starting instruction is i0 and MaxMemDepDistance = 3.
	            //
	            //                      +--------v--v--v
	            //             i0,i1,i2,i3,i4,i5,i6,i7,i8
	            //             +--------^--^--^
	            //
	            // MaxMemDepDistance let us stop alias-checking at i3 and we add
	            // dependencies from i0 to i3,i4,.. (even if they are not aliased).
	            // Previously we already added dependencies from i3 to i6,i7,i8
	            // (because of MaxMemDepDistance). As we added a dependency from
	            // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
	            // and we can abort this loop at i6.
	            if (DistToSrc >= 2 * MaxMemDepDistance)
	              break;
	            DistToSrc++;
	          }
	        }
	      }
	      BundleMember = BundleMember->NextInBundle;
	    }
	    if (InsertInReadyList && SD->isReady()) {
	      ReadyInsts.push_back(SD);
	      DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
	    }
	  }
	}

	void BoUpSLP::BlockScheduling::resetSchedule() {
	assert(ScheduleStart &&
	"tried to reset schedule on block which has not been scheduled");
	for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
	ScheduleData *SD = getScheduleData(I);
	assert(isInSchedulingRegion(SD));
	SD->IsScheduled = false;
	SD->resetUnscheduledDeps();
	}
	ReadyInsts.clear();
	}

	// Performs the final scheduling of the region recorded in BS and physically
	// reorders the instructions of the block accordingly.
	//
	// Does nothing if BS has no scheduling region (ScheduleStart is null). After
	// scheduling, ScheduleStart is cleared so the block is not scheduled twice.
	void BoUpSLP::scheduleBlock(BlockScheduling *BS) {

	  if (!BS->ScheduleStart)
	    return;

	  DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

	  BS->resetSchedule();

	  // For the real scheduling we use a more sophisticated ready-list: it is
	  // sorted by the original instruction location. This lets the final schedule
	  // be as close as possible to the original instruction order.
	  struct ScheduleDataCompare {
	    // Orders entries by descending SchedulingPriority. Declared const so the
	    // comparator can be invoked through a const std::set.
	    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
	      return SD2->SchedulingPriority < SD1->SchedulingPriority;
	    }
	  };
	  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

	  // Ensure that all dependency data is updated and fill the ready-list with
	  // initial instructions.
	  int Idx = 0;
	  int NumToSchedule = 0;
	  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
	       I = I->getNextNode()) {
	    ScheduleData *SD = BS->getScheduleData(I);
	    assert(
	        SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
	        "scheduler and vectorizer have different opinion on what is a bundle");
	    // The priority lives on the bundle head; later members overwrite it, so
	    // a bundle ends up with the index of its last member in the region.
	    SD->FirstInBundle->SchedulingPriority = Idx++;
	    if (SD->isSchedulingEntity()) {
	      BS->calculateDependencies(SD, false, this);
	      NumToSchedule++;
	    }
	  }
	  BS->initialFillReadyList(ReadyInsts);

	  Instruction *LastScheduledInst = BS->ScheduleEnd;

	  // Do the "real" scheduling.
	  while (!ReadyInsts.empty()) {
	    ScheduleData *picked = *ReadyInsts.begin();
	    ReadyInsts.erase(ReadyInsts.begin());

	    // Move the scheduled instruction(s) to their dedicated places, if not
	    // there yet.
	    ScheduleData *BundleMember = picked;
	    while (BundleMember) {
	      Instruction *pickedInst = BundleMember->Inst;
	      if (LastScheduledInst->getNextNode() != pickedInst) {
	        BS->BB->getInstList().remove(pickedInst);
	        BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
	                                     pickedInst);
	      }
	      LastScheduledInst = pickedInst;
	      BundleMember = BundleMember->NextInBundle;
	    }

	    BS->schedule(picked, ReadyInsts);
	    NumToSchedule--;
	  }
	  assert(NumToSchedule == 0 && "could not schedule all instructions");

	  // Avoid duplicate scheduling of the block.
	  BS->ScheduleStart = nullptr;
	}

	// Returns the element width, in bits, to use when vectorizing a tree rooted
	// at V.
	//
	// For a store this is the width of the stored value. Otherwise the operand
	// tree of V is walked and the widest load found is used; if no load is found
	// or an unhandled instruction is encountered, the width of V's own type is
	// returned.
	unsigned BoUpSLP::getVectorElementSize(Value *V) {
	  // If V is a store, just return the width of the stored value without
	  // traversing the expression tree. This is the common case.
	  if (auto *Store = dyn_cast<StoreInst>(V))
	    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

	  // If V is not a store, we can traverse the expression tree to find loads
	  // that feed it. The type of the loaded value may indicate a more suitable
	  // width than V's type. We want to base the vector element size on the width
	  // of memory operations where possible.
	  SmallVector<Instruction *, 16> Worklist;
	  SmallPtrSet<Instruction *, 16> Visited;
	  if (auto *I = dyn_cast<Instruction>(V))
	    Worklist.push_back(I);

	  // Traverse the expression tree in bottom-up order looking for loads. If we
	  // encounter an instruction we don't yet handle, we give up.
	  auto MaxWidth = 0u;
	  auto FoundUnknownInst = false;
	  while (!Worklist.empty() && !FoundUnknownInst) {
	    auto *I = Worklist.pop_back_val();
	    Visited.insert(I);

	    // We should only be looking at scalar instructions here. If the current
	    // instruction has a vector type, give up.
	    auto *Ty = I->getType();
	    if (isa<VectorType>(Ty))
	      FoundUnknownInst = true;

	    // If the current instruction is a load, update MaxWidth to reflect the
	    // width of the loaded value.
	    else if (isa<LoadInst>(I))
	      MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));

	    // Otherwise, we need to visit the operands of the instruction. We only
	    // handle the interesting cases from buildTree here. If an operand is an
	    // instruction we haven't yet visited, we add it to the worklist.
	    else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
	             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
	      for (Use &U : I->operands())
	        if (auto *J = dyn_cast<Instruction>(U.get()))
	          if (!Visited.count(J))
	            Worklist.push_back(J);
	    }

	    // If we don't yet handle the instruction, give up.
	    else
	      FoundUnknownInst = true;
	  }

	  // If we didn't encounter a memory access in the expression tree, or if we
	  // gave up for some reason, just return the width of V.
	  if (!MaxWidth || FoundUnknownInst)
	    return DL->getTypeSizeInBits(V->getType());

	  // Otherwise, return the maximum width we found.
	  return MaxWidth;
	}

	// Determine if a value V in a vectorizable expression Expr can be demoted to a
	// smaller type with a truncation. We collect the values that will be demoted
	// in ToDemote and additional roots that require investigating in Roots.
	static bool collectValuesToDemote(Value V, SmallPtrSetImpl<Value > &Expr,
	SmallVectorImpl<Value *> &ToDemote,
	SmallVectorImpl<Value *> &Roots) {

	// We can always demote constants.
	if (isa<Constant>(V)) {
	ToDemote.push_back(V);
	return true;
	}

	// If the value is not an instruction in the expression with only one use, it
	// cannot be demoted.
	auto *I = dyn_cast<Instruction>(V);
	if (!I \|\| !I->hasOneUse() \|\| !Expr.count(I))
	return false;

	switch (I->getOpcode()) {

	// We can always demote truncations and extensions. Since truncations can
	// seed additional demotion, we save the truncated value.
	case Instruction::Trunc:
	Roots.push_back(I->getOperand(0));
	case Instruction::ZExt:
	case Instruction::SExt:
	break;

	// We can demote certain binary operations if we can demote both of their
	// operands.
	case Instruction::Add:
	case Instruction::Sub:
	case Instruction::Mul:
	case Instruction::And:
	case Instruction::Or:
	case Instruction::Xor:
	if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) \|\|
	!collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
	return false;
	break;

	// We can demote selects if we can demote their true and false values.
	case Instruction::Select: {
	SelectInst *SI = cast<SelectInst>(I);
	if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) \|\|
	!collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
	return false;
	break;
	}

	// We can demote phis if we can demote all their incoming operands. Note that
	// we don't need to worry about cycles since we ensure single use above.
	case Instruction::PHI: {
	PHINode *PN = cast<PHINode>(I);
	for (Value *IncValue : PN->incoming_values())
	if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
	return false;
	break;
	}

	// Otherwise, conservatively give up.
	default:
	return false;
	}

	// Record the value that we can demote.
	ToDemote.push_back(V);
	return true;
	}

	// Computes, for the current vectorizable tree, the smallest power-of-two bit
	// width its integer values can be truncated to, and records it in MinBWs.
	//
	// Each MinBWs entry maps a demotable scalar to the pair (bit width, needs
	// sign extension). Nothing is recorded when the tree has no external uses,
	// is not rooted by integer values, has non-root values used externally, or
	// when no narrowing below the root type's width is possible.
	void BoUpSLP::computeMinimumValueSizes() {
	  // If there are no external uses, the expression tree must be rooted by a
	  // store. We can't demote in-memory values, so there is nothing to do here.
	  if (ExternalUses.empty())
	    return;

	  // We only attempt to truncate integer expressions.
	  auto &TreeRoot = VectorizableTree[0].Scalars;
	  auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
	  if (!TreeRootIT)
	    return;

	  // If the expression is not rooted by a store, these roots should have
	  // external uses. We will rely on InstCombine to rewrite the expression in
	  // the narrower type. However, InstCombine only rewrites single-use values.
	  // This means that if a tree entry other than a root is used externally, it
	  // must have multiple uses and InstCombine will not rewrite it. The code
	  // below ensures that only the roots are used externally.
	  SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
	  for (auto &EU : ExternalUses)
	    if (!Expr.erase(EU.Scalar))
	      return;
	  if (!Expr.empty())
	    return;

	  // Collect the scalar values of the vectorizable expression. We will use this
	  // context to determine which values can be demoted. If we see a truncation,
	  // we mark it as seeding another demotion.
	  for (auto &Entry : VectorizableTree)
	    Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());

	  // Ensure the roots of the vectorizable tree don't form a cycle. They must
	  // have a single external user that is not in the vectorizable tree.
	  for (auto *Root : TreeRoot)
	    if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
	      return;

	  // Conservatively determine if we can actually truncate the roots of the
	  // expression. Collect the values that can be demoted in ToDemote and
	  // additional roots that require investigating in Roots.
	  SmallVector<Value *, 32> ToDemote;
	  SmallVector<Value *, 4> Roots;
	  for (auto *Root : TreeRoot)
	    if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
	      return;

	  // The maximum bit width required to represent all the values that can be
	  // demoted without loss of precision. It would be safe to truncate the roots
	  // of the expression to this width.
	  auto MaxBitWidth = 8u;

	  // We first check if all the bits of the roots are demanded. If they're not,
	  // we can truncate the roots to this narrower type.
	  for (auto *Root : TreeRoot) {
	    auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
	    MaxBitWidth = std::max<unsigned>(
	        Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
	  }

	  // True if the roots can be zero-extended back to their original type, rather
	  // than sign-extended. We know that if the leading bits are not demanded, we
	  // can safely zero-extend. So we initialize IsKnownPositive to True.
	  bool IsKnownPositive = true;

	  // If all the bits of the roots are demanded, we can try a little harder to
	  // compute a narrower type. This can happen, for example, if the roots are
	  // getelementptr indices. InstCombine promotes these indices to the pointer
	  // width. Thus, all their bits are technically demanded even though the
	  // address computation might be vectorized in a smaller type.
	  //
	  // We start by looking at each entry that can be demoted. We compute the
	  // maximum bit width required to store the scalar by using ValueTracking to
	  // compute the number of high-order bits we can truncate.
	  if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
	    MaxBitWidth = 8u;

	    // Determine if the sign bit of all the roots is known to be zero. If not,
	    // IsKnownPositive is set to False.
	    IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
	      bool KnownZero = false;
	      bool KnownOne = false;
	      ComputeSignBit(R, KnownZero, KnownOne, *DL);
	      return KnownZero;
	    });

	    // Determine the maximum number of bits required to store the scalar
	    // values.
	    for (auto *Scalar : ToDemote) {
	      auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
	      auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
	      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
	    }

	    // If we can't prove that the sign bit is zero, we must add one to the
	    // maximum bit width to account for the unknown sign bit. This preserves
	    // the existing sign bit so we can safely sign-extend the root back to the
	    // original type. Otherwise, if we know the sign bit is zero, we will
	    // zero-extend the root instead.
	    //
	    // FIXME: This is somewhat suboptimal, as there will be cases where adding
	    //        one to the maximum bit width will yield a larger-than-necessary
	    //        type. In general, we need to add an extra bit only if we can't
	    //        prove that the upper bit of the original type is equal to the
	    //        upper bit of the proposed smaller type. If these two bits are the
	    //        same (either zero or one) we know that sign-extending from the
	    //        smaller type will result in the same value. Here, since we can't
	    //        yet prove this, we are just making the proposed smaller type
	    //        larger to ensure correctness.
	    if (!IsKnownPositive)
	      ++MaxBitWidth;
	  }

	  // Round MaxBitWidth up to the next power-of-two.
	  if (!isPowerOf2_64(MaxBitWidth))
	    MaxBitWidth = NextPowerOf2(MaxBitWidth);

	  // If the maximum bit width we compute is less than the width of the roots'
	  // type, we can proceed with the narrowing. Otherwise, do nothing.
	  if (MaxBitWidth >= TreeRootIT->getBitWidth())
	    return;

	  // If we can truncate the root, we must collect additional values that might
	  // be demoted as a result. That is, those seeded by truncations we will
	  // modify.
	  while (!Roots.empty())
	    collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);

	  // Finally, map the values we can demote to the maximum bit width we computed.
	  for (auto *Scalar : ToDemote)
	    MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
	}

	namespace {
	/// The SLPVectorizer Pass.
	struct SLPVectorizer : public FunctionPass {
	SLPVectorizerPass Impl;

	/// Pass identification, replacement for typeid
	static char ID;

	explicit SLPVectorizer() : FunctionPass(ID) {
	initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
	}


	bool doInitialization(Module &M) override {
	return false;
	}

	bool runOnFunction(Function &F) override {
	if (skipFunction(F))
	return false;

	auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
	auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
	auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
	auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
	auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
	auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
	auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
	auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
	auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();

	return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	FunctionPass::getAnalysisUsage(AU);
	AU.addRequired<AssumptionCacheTracker>();
	AU.addRequired<ScalarEvolutionWrapperPass>();
	AU.addRequired<AAResultsWrapperPass>();
	AU.addRequired<TargetTransformInfoWrapperPass>();
	AU.addRequired<LoopInfoWrapperPass>();
	AU.addRequired<DominatorTreeWrapperPass>();
	AU.addRequired<DemandedBitsWrapperPass>();
	AU.addPreserved<LoopInfoWrapperPass>();
	AU.addPreserved<DominatorTreeWrapperPass>();
	AU.addPreserved<AAResultsWrapperPass>();
	AU.addPreserved<GlobalsAAWrapperPass>();
	AU.setPreservesCFG();
	}
	};
	} // end anonymous namespace

	/// Entry point taking a FunctionAnalysisManager: fetches the analyses from
	/// \p AM, runs runImpl on \p F and reports which analyses are preserved.
	/// Everything is preserved when runImpl reports no change.
	PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
	  ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
	  TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
	  // Only a cached result is used for the target library info.
	  TargetLibraryInfo *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
	  AliasAnalysis *AA = &AM.getResult<AAManager>(F);
	  LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
	  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
	  AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
	  DemandedBits *DB = &AM.getResult<DemandedBitsAnalysis>(F);

	  if (!runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB))
	    return PreservedAnalyses::all();

	  PreservedAnalyses Preserved;
	  Preserved.preserve<LoopAnalysis>();
	  Preserved.preserve<DominatorTreeAnalysis>();
	  Preserved.preserve<AAManager>();
	  Preserved.preserve<GlobalsAA>();
	  return Preserved;
	}

	bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
	TargetTransformInfo *TTI_,
	TargetLibraryInfo TLI_, AliasAnalysis AA_,
	LoopInfo LI_, DominatorTree DT_,
	AssumptionCache AC_, DemandedBits DB_) {
	SE = SE_;
	TTI = TTI_;
	TLI = TLI_;
	AA = AA_;
	LI = LI_;
	DT = DT_;
	AC = AC_;
	DB = DB_;
	DL = &F.getParent()->getDataLayout();

	Stores.clear();
	GEPs.clear();
	bool Changed = false;

	// If the target claims to have no vector registers don't attempt
	// vectorization.
	if (!TTI->getNumberOfRegisters(true))
	return false;

	// Don't vectorize when the attribute NoImplicitFloat is used.
	if (F.hasFnAttribute(Attribute::NoImplicitFloat))
	return false;

	DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

	// Use the bottom up slp vectorizer to construct chains that start with
	// store instructions.
	BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);

	// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
	// delete instructions.

	// Scan the blocks in the function in post order.
	for (auto BB : post_order(&F.getEntryBlock())) {
	collectSeedInstructions(BB);

	// Vectorize trees that end at stores.
	if (!Stores.empty()) {
	DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
	<< " underlying objects.\n");
	Changed \|= vectorizeStoreChains(R);
	}

	// Vectorize trees that end at reductions.
	Changed \|= vectorizeChainsInBlock(BB, R);

	// Vectorize the index computations of getelementptr instructions. This
	// is primarily intended to catch gather-like idioms ending at
	// non-consecutive loads.
	if (!GEPs.empty()) {
	DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
	<< " underlying objects.\n");
	Changed \|= vectorizeGEPIndices(BB, R);
	}
	}

	if (Changed) {
	R.optimizeGatherSequence();
	DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
	DEBUG(verifyFunction(F));
	}
	return Changed;
	}

	/// \brief Report whether any value in a slice of \p VL no longer matches the
	/// handle tracking it in \p VH.
	///
	/// Vectorizing one part of VL can invalidate values further along in VL; the
	/// WeakVH array records when that has happened. The slice covers
	/// \p SliceSize elements starting at index \p SliceBegin. Returns true if at
	/// least one element of the slice differs from its handle.
	static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
	                               unsigned SliceBegin, unsigned SliceSize) {
	  ArrayRef<Value *> Values = VL.slice(SliceBegin, SliceSize);
	  ArrayRef<WeakVH> Handles = VH.slice(SliceBegin, SliceSize);
	  const bool AllIntact =
	      std::equal(Values.begin(), Values.end(), Handles.begin());
	  return !AllIntact;
	}

	/// Tries to vectorize a chain of consecutive stores for a vector register of
	/// \p VecRegSize bits.
	///
	/// The vectorization factor is VecRegSize divided by the element size that
	/// \p R reports for the first store. Windows of that many stores are tried
	/// at every offset of \p Chain and vectorized when the tree cost is below
	/// -SLPCostThreshold. Returns true if at least one window was vectorized.
	bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
	                                            unsigned VecRegSize) {
	  unsigned ChainLen = Chain.size();
	  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
	               << "\n");
	  unsigned Sz = R.getVectorElementSize(Chain[0]);
	  unsigned VF = VecRegSize / Sz;

	  if (!isPowerOf2_32(Sz) || VF < 2)
	    return false;

	  // Keep track of values that were deleted by vectorizing in the loop below.
	  SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());

	  bool Changed = false;
	  // Look for profitable vectorizable trees at all offsets, starting at zero.
	  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
	    if (i + VF > e)
	      break;

	    // Check that a previous iteration of this loop did not delete the Value.
	    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
	      continue;

	    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
	                 << "\n");
	    ArrayRef<Value *> Operands = Chain.slice(i, VF);

	    R.buildTree(Operands);
	    if (R.isTreeTinyAndNotFullyVectorizable())
	      continue;

	    R.computeMinimumValueSizes();

	    int Cost = R.getTreeCost();

	    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
	    if (Cost < -SLPCostThreshold) {
	      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
	      R.vectorizeTree();

	      // Move to the next bundle.
	      // (The loop increment adds the remaining 1.)
	      i += VF - 1;
	      Changed = true;
	    }
	  }

	  return Changed;
	}

	/// Finds chains of consecutive stores among \p Stores and tries to vectorize
	/// each chain.
	///
	/// Every store is paired with at most one consecutive successor (the nearest
	/// candidate, searching forward first and then backward). Chains are then
	/// followed from stores that start a link but do not end one, and handed to
	/// vectorizeStoreChain with register sizes from R.getMaxVecRegSize() down to
	/// R.getMinVecRegSize(), halving each time. Returns true if anything was
	/// vectorized.
	bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
	                                        BoUpSLP &R) {
	  SetVector<StoreInst *> Heads, Tails;
	  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

	  // We may run into multiple chains that merge into a single chain. We mark the
	  // stores that we vectorized so that we don't visit the same store twice.
	  BoUpSLP::ValueSet VectorizedStores;
	  bool Changed = false;

	  // Do a quadratic search on all of the given stores and find
	  // all of the pairs of stores that follow each other.
	  SmallVector<unsigned, 16> IndexQueue;
	  for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
	    IndexQueue.clear();
	    // If a store has multiple consecutive store candidates, search Stores
	    // array according to the sequence: from i+1 to e, then from i-1 to 0.
	    // This is because usually pairing with immediate succeeding or preceding
	    // candidate create the best chance to find slp vectorization opportunity.
	    unsigned j = 0;
	    for (j = i + 1; j < e; ++j)
	      IndexQueue.push_back(j);
	    for (j = i; j > 0; --j)
	      IndexQueue.push_back(j - 1);

	    for (auto &k : IndexQueue) {
	      if (isConsecutiveAccess(Stores[i], Stores[k], DL, SE)) {
	        Tails.insert(Stores[k]);
	        Heads.insert(Stores[i]);
	        ConsecutiveChain[Stores[i]] = Stores[k];
	        break;
	      }
	    }
	  }

	  // For stores that start but don't end a link in the chain:
	  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
	       it != e; ++it) {
	    if (Tails.count(*it))
	      continue;

	    // We found a store instr that starts a chain. Now follow the chain and try
	    // to vectorize it.
	    BoUpSLP::ValueList Operands;
	    StoreInst *I = *it;
	    // Collect the chain into a list.
	    while (Tails.count(I) || Heads.count(I)) {
	      if (VectorizedStores.count(I))
	        break;
	      Operands.push_back(I);
	      // Move to the next value in the chain.
	      I = ConsecutiveChain[I];
	    }

	    // FIXME: Is division-by-2 the correct step? Should we assert that the
	    // register size is a power-of-2?
	    for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
	         Size /= 2) {
	      if (vectorizeStoreChain(Operands, R, Size)) {
	        // Mark the vectorized stores so that we don't vectorize them again.
	        VectorizedStores.insert(Operands.begin(), Operands.end());
	        Changed = true;
	        break;
	      }
	    }
	  }

	  return Changed;
	}

	/// Collects the store and getelementptr instructions of \p BB that can seed
	/// vectorization.
	///
	/// The Stores and GEPs collections are cleared first and then refilled, with
	/// each instruction filed under the underlying object of its pointer operand.
	void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {

	  // Initialize the collections. We will make a single pass over the block.
	  Stores.clear();
	  GEPs.clear();

	  // Visit the store and getelementptr instructions in BB and organize them in
	  // Stores and GEPs according to the underlying objects of their pointer
	  // operands.
	  for (Instruction &I : *BB) {

	    // Ignore store instructions that are not simple or whose stored value
	    // does not have a valid element type.
	    if (auto *SI = dyn_cast<StoreInst>(&I)) {
	      if (!SI->isSimple())
	        continue;
	      if (!isValidElementType(SI->getValueOperand()->getType()))
	        continue;
	      Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
	    }

	    // Ignore getelementptr instructions that have more than one index, a
	    // constant index, an index whose type is not a valid element type, or a
	    // vector result type.
	    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
	      auto Idx = GEP->idx_begin()->get();
	      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
	        continue;
	      if (!isValidElementType(Idx->getType()))
	        continue;
	      if (GEP->getType()->isVectorTy())
	        continue;
	      GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
	    }
	  }
	}

	bool SLPVectorizerPass::tryToVectorizePair(Value A, Value B, BoUpSLP &R) {
	if (!A \|\| !B)
	return false;
	Value *VL[] = { A, B };
	return tryToVectorizeList(VL, R, None, true);
	}

	/// Try to vectorize the values in \p VL, in bundles of decreasing width.
	///
	/// \param VL           Candidate scalars; all must be instructions with the
	///                     same opcode as VL[0] and a valid element type.
	/// \param R            The SLP tree builder used for analysis and codegen.
	/// \param BuildVector  Optional list of insert instructions parallel to
	///                     \p VL; when non-empty they are rewired to extracts of
	///                     the vectorized root.
	/// \param AllowReorder If true, a two-element bundle may be retried with its
	///                     operands swapped when R.shouldReorder() says so.
	/// \returns true if at least one bundle was vectorized.
	bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
	                                           ArrayRef<Value *> BuildVector,
	                                           bool AllowReorder) {
	  if (VL.size() < 2)
	    return false;

	  DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
	               << ".\n");

	  // Check that all of the parts are scalar instructions of the same type.
	  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
	  if (!I0)
	    return false;

	  unsigned Opcode0 = I0->getOpcode();

	  unsigned Sz = R.getVectorElementSize(I0);
	  unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
	  unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
	  if (MaxVF < 2)
	    return false;

	  for (Value *V : VL) {
	    Type *Ty = V->getType();
	    if (!isValidElementType(Ty))
	      return false;
	    Instruction *Inst = dyn_cast<Instruction>(V);
	    if (!Inst || Inst->getOpcode() != Opcode0)
	      return false;
	  }

	  bool Changed = false;

	  // Keep track of values that were deleted by vectorizing in the loop below.
	  SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());

	  unsigned NextInst = 0, MaxInst = VL.size();
	  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
	       VF /= 2) {
	    // No actual vectorization should happen, if number of parts is the same as
	    // provided vectorization factor (i.e. the scalar type is used for vector
	    // code during codegen).
	    auto *VecTy = VectorType::get(VL[0]->getType(), VF);
	    if (TTI->getNumberOfParts(VecTy) == VF)
	      continue;
	    for (unsigned I = NextInst; I < MaxInst; ++I) {
	      unsigned OpsWidth = 0;

	      // Clamp the last bundle to the number of remaining values.
	      if (I + VF > MaxInst)
	        OpsWidth = MaxInst - I;
	      else
	        OpsWidth = VF;

	      if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
	        break;

	      // Check that a previous iteration of this loop did not delete the Value.
	      if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
	        continue;

	      DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
	                   << "\n");
	      ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);

	      ArrayRef<Value *> BuildVectorSlice;
	      if (!BuildVector.empty())
	        BuildVectorSlice = BuildVector.slice(I, OpsWidth);

	      R.buildTree(Ops, BuildVectorSlice);
	      // TODO: check if we can allow reordering for more cases.
	      if (AllowReorder && R.shouldReorder()) {
	        // Conceptually, there is nothing actually preventing us from trying to
	        // reorder a larger list. In fact, we do exactly this when vectorizing
	        // reductions. However, at this point, we only expect to get here from
	        // tryToVectorizePair().
	        assert(Ops.size() == 2);
	        assert(BuildVectorSlice.empty());
	        Value *ReorderedOps[] = {Ops[1], Ops[0]};
	        R.buildTree(ReorderedOps, None);
	      }
	      if (R.isTreeTinyAndNotFullyVectorizable())
	        continue;

	      R.computeMinimumValueSizes();
	      int Cost = R.getTreeCost();

	      if (Cost < -SLPCostThreshold) {
	        DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
	        Value *VectorizedRoot = R.vectorizeTree();

	        // Reconstruct the build vector by extracting the vectorized root. This
	        // way we handle the case where some elements of the vector are
	        // undefined.
	        //  (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
	        if (!BuildVectorSlice.empty()) {
	          // The insert point is the last build vector instruction. The
	          // vectorized root will precede it. This guarantees that we get an
	          // instruction. The vectorized tree could have been constant folded.
	          Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
	          unsigned VecIdx = 0;
	          for (auto &V : BuildVectorSlice) {
	            IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
	                                        ++BasicBlock::iterator(InsertAfter));
	            // Named InsertI so it does not shadow the bundle index I.
	            Instruction *InsertI = cast<Instruction>(V);
	            assert(isa<InsertElementInst>(InsertI) ||
	                   isa<InsertValueInst>(InsertI));
	            Instruction *Extract =
	                cast<Instruction>(Builder.CreateExtractElement(
	                    VectorizedRoot, Builder.getInt32(VecIdx++)));
	            InsertI->setOperand(1, Extract);
	            InsertI->removeFromParent();
	            InsertI->insertAfter(Extract);
	            InsertAfter = InsertI;
	          }
	        }
	        // Move to the next bundle.
	        I += VF - 1;
	        NextInst = I + 1;
	        Changed = true;
	      }
	    }
	  }

	  return Changed;
	}

	// Diff summary: the old side ('-') only paired operands that are instructions
	// in V's own basic block; the new side ('+') hands the operands straight to
	// tryToVectorizePair(), which itself returns false for null arguments.
	bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
	if (!V)
	return false;

	- Value *P = V->getParent();
	-
	- // Vectorize in current basic block only.
	- auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
	- auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
	- if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
	- return false;
	-
	// Try to vectorize V.
	- if (tryToVectorizePair(Op0, Op1, R))
	+ if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
	return true;

	- auto *A = dyn_cast<BinaryOperator>(Op0);
	- auto *B = dyn_cast<BinaryOperator>(Op1);
	+ BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
	+ BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
	// Try to skip B.
	if (B && B->hasOneUse()) {
	- auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
	- auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
	- if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
	+ BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
	+ BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
	+ if (tryToVectorizePair(A, B0, R)) {
	return true;
	- if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
	+ }
	+ if (tryToVectorizePair(A, B1, R)) {
	return true;
	+ }
	}

	// Try to skip A.
	if (A && A->hasOneUse()) {
	- auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
	- auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
	- if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
	+ BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
	+ BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
	+ if (tryToVectorizePair(A0, B, R)) {
	return true;
	- if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
	+ }
	+ if (tryToVectorizePair(A1, B, R)) {
	return true;
	+ }
	}
	- return false;
	+ return 0;
	}

	/// \brief Generate a shuffle mask to be used in a reduction tree.
	///
	/// \param VecLen The length of the vector to be reduced.
	/// \param NumEltsToRdx The number of elements that should be reduced in the
	///        vector.
	/// \param IsPairwise Whether the reduction is a pairwise or splitting
	///        reduction. A pairwise reduction will generate a mask of
	///        <0,2,...> or <1,3,..> while a splitting reduction will generate
	///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
	/// \param IsLeft True will generate a mask of even elements, odd otherwise.
	/// \param Builder Used to create the i32 constants of the mask.
	/// \returns A constant vector of \p VecLen i32 elements; entries past
	///          \p NumEltsToRdx stay undef.
	static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
	                                   bool IsPairwise, bool IsLeft,
	                                   IRBuilder<> &Builder) {
	  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");

	  // Start with an all-undef mask and fill in the leading lanes below.
	  SmallVector<Constant *, 32> ShuffleMask(
	      VecLen, UndefValue::get(Builder.getInt32Ty()));

	  if (IsPairwise) {
	    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
	    for (unsigned i = 0; i != NumEltsToRdx; ++i)
	      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
	  } else {
	    // Move the upper half of the vector to the lower half.
	    for (unsigned i = 0; i != NumEltsToRdx; ++i)
	      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
	  }

	  return ConstantVector::get(ShuffleMask);
	}

	namespace {
	/// Model horizontal reductions.
	///
	/// A horizontal reduction is a tree of reduction operations (currently add and
	/// fadd) that has operations that can be put into a vector as its leaf.
	/// For example, this tree:
	///
	/// mul mul mul mul
	/// \ / \ /
	/// + +
	/// \ /
	/// +
	/// This tree has "mul" as its reduced values and "+" as its reduction
	/// operations. A reduction might be feeding into a store or a binary operation
	/// feeding a phi.
	/// ...
	/// \ /
	/// +
	/// \|
	/// phi +=
	///
	/// Or:
	/// ...
	/// \ /
	/// +
	/// \|
	/// *p =
	///
	class HorizontalReduction {
	SmallVector<Value *, 16> ReductionOps;
	SmallVector<Value *, 32> ReducedVals;

	BinaryOperator *ReductionRoot;
	// After successfull horizontal reduction vectorization attempt for PHI node
	// vectorizer tries to update root binary op by combining vectorized tree and
	// the ReductionPHI node. But during vectorization this ReductionPHI can be
	// vectorized itself and replaced by the undef value, while the instruction
	// itself is marked for deletion. This 'marked for deletion' PHI node then can
	// be used in new binary operation, causing "Use still stuck around after Def
	// is destroyed" crash upon PHI node deletion.
	WeakVH ReductionPHI;

	/// The opcode of the reduction.
	unsigned ReductionOpcode;
	/// The opcode of the values we perform a reduction on.
	unsigned ReducedValueOpcode;
	/// Should we model this reduction as a pairwise reduction tree or a tree that
	/// splits the vector in halves and adds those halves.
	bool IsPairwiseReduction;

	public:
	/// The width of one full horizontal reduction operation.
	unsigned ReduxWidth;

	/// Minimal width of available vector registers. It's used to determine
	/// ReduxWidth.
	unsigned MinVecRegSize;

	HorizontalReduction(unsigned MinVecRegSize)
	: ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
	IsPairwiseReduction(false), ReduxWidth(0),
	MinVecRegSize(MinVecRegSize) {}

	/// \brief Try to find a reduction tree.
	bool matchAssociativeReduction(PHINode Phi, BinaryOperator B) {
	assert((!Phi \|\| is_contained(Phi->operands(), B)) &&
	"Thi phi needs to use the binary operator");

	// We could have a initial reductions that is not an add.
	// r *= v1 + v2 + v3 + v4
	// In such a case start looking for a tree rooted in the first '+'.
	if (Phi) {
	if (B->getOperand(0) == Phi) {
	Phi = nullptr;
	B = dyn_cast<BinaryOperator>(B->getOperand(1));
	} else if (B->getOperand(1) == Phi) {
	Phi = nullptr;
	B = dyn_cast<BinaryOperator>(B->getOperand(0));
	}
	}

	if (!B)
	return false;

	Type *Ty = B->getType();
	if (!isValidElementType(Ty))
	return false;

	const DataLayout &DL = B->getModule()->getDataLayout();
	ReductionOpcode = B->getOpcode();
	ReducedValueOpcode = 0;
	// FIXME: Register size should be a parameter to this function, so we can
	// try different vectorization factors.
	ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
	ReductionRoot = B;
	ReductionPHI = Phi;

	if (ReduxWidth < 4)
	return false;

	// We currently only support adds.
	if (ReductionOpcode != Instruction::Add &&
	ReductionOpcode != Instruction::FAdd)
	return false;

	// Post order traverse the reduction tree starting at B. We only handle true
	// trees containing only binary operators or selects.
	SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
	Stack.push_back(std::make_pair(B, 0));
	while (!Stack.empty()) {
	Instruction *TreeN = Stack.back().first;
	unsigned EdgeToVist = Stack.back().second++;
	bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;

	// Only handle trees in the current basic block.
	if (TreeN->getParent() != B->getParent())
	return false;

	// Each tree node needs to have one user except for the ultimate
	// reduction.
	if (!TreeN->hasOneUse() && TreeN != B)
	return false;

	// Postorder vist.
	if (EdgeToVist == 2 \|\| IsReducedValue) {
	if (IsReducedValue) {
	// Make sure that the opcodes of the operations that we are going to
	// reduce match.
	if (!ReducedValueOpcode)
	ReducedValueOpcode = TreeN->getOpcode();
	else if (ReducedValueOpcode != TreeN->getOpcode())
	return false;
	ReducedVals.push_back(TreeN);
	} else {
	// We need to be able to reassociate the adds.
	if (!TreeN->isAssociative())
	return false;
	ReductionOps.push_back(TreeN);
	}
	// Retract.
	Stack.pop_back();
	continue;
	}

	// Visit left or right.
	Value *NextV = TreeN->getOperand(EdgeToVist);
	if (NextV != Phi) {
	auto *I = dyn_cast<Instruction>(NextV);
	// Continue analysis if the next operand is a reduction operation or
	// (possibly) a reduced value. If the reduced value opcode is not set,
	// the first met operation != reduction operation is considered as the
	// reduced value class.
	if (I && (!ReducedValueOpcode \|\| I->getOpcode() == ReducedValueOpcode \|\|
	I->getOpcode() == ReductionOpcode)) {
	if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
	ReducedValueOpcode = I->getOpcode();
	Stack.push_back(std::make_pair(I, 0));
	continue;
	}
	return false;
	}
	}
	return true;
	}

	/// \brief Attempt to vectorize the tree found by
	/// matchAssociativeReduction.
	bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
	if (ReducedVals.empty())
	return false;

	unsigned NumReducedVals = ReducedVals.size();
	if (NumReducedVals < ReduxWidth)
	return false;

	Value *VectorizedTree = nullptr;
	IRBuilder<> Builder(ReductionRoot);
	FastMathFlags Unsafe;
	Unsafe.setUnsafeAlgebra();
	Builder.setFastMathFlags(Unsafe);
	unsigned i = 0;

	for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
	auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
	V.buildTree(VL, ReductionOps);
	if (V.shouldReorder()) {
	SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
	V.buildTree(Reversed, ReductionOps);
	}
	if (V.isTreeTinyAndNotFullyVectorizable())
	continue;

	V.computeMinimumValueSizes();

	// Estimate cost.
	int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
	if (Cost >= -SLPCostThreshold)
	break;

	DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
	<< ". (HorRdx)\n");

	// Vectorize a tree.
	DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
	Value *VectorizedRoot = V.vectorizeTree();

	// Emit a reduction.
	Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
	if (VectorizedTree) {
	Builder.SetCurrentDebugLocation(Loc);
	VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
	ReducedSubTree, "bin.rdx");
	} else
	VectorizedTree = ReducedSubTree;
	}

	if (VectorizedTree) {
	// Finish the reduction.
	for (; i < NumReducedVals; ++i) {
	Builder.SetCurrentDebugLocation(
	cast<Instruction>(ReducedVals[i])->getDebugLoc());
	VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
	ReducedVals[i]);
	}
	// Update users.
	if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
	assert(ReductionRoot && "Need a reduction operation");
	ReductionRoot->setOperand(0, VectorizedTree);
	ReductionRoot->setOperand(1, ReductionPHI);
	} else
	ReductionRoot->replaceAllUsesWith(VectorizedTree);
	}
	return VectorizedTree != nullptr;
	}

	unsigned numReductionValues() const {
	return ReducedVals.size();
	}

	private:
	/// \brief Calculate the cost of a reduction.
	int getReductionCost(TargetTransformInfo TTI, Value FirstReducedVal) {
	Type *ScalarTy = FirstReducedVal->getType();
	Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);

	int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
	int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);

	IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
	int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

	int ScalarReduxCost =
	(ReduxWidth - 1) *
	TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);

	DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
	<< " for reduction that starts with " << *FirstReducedVal
	<< " (It is a "
	<< (IsPairwiseReduction ? "pairwise" : "splitting")
	<< " reduction)\n");

	return VecReduxCost - ScalarReduxCost;
	}

	static Value createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value L,
	Value *R, const Twine &Name = "") {
	if (Opcode == Instruction::FAdd)
	return Builder.CreateFAdd(L, R, Name);
	return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
	}

	/// \brief Emit a horizontal reduction of the vectorized value.
	Value emitReduction(Value VectorizedValue, IRBuilder<> &Builder) {
	assert(VectorizedValue && "Need to have a vectorized tree node");
	assert(isPowerOf2_32(ReduxWidth) &&
	"We only handle power-of-two reductions for now");

	Value *TmpVec = VectorizedValue;
	for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
	if (IsPairwiseReduction) {
	Value *LeftMask =
	createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
	Value *RightMask =
	createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

	Value *LeftShuf = Builder.CreateShuffleVector(
	TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
	Value *RightShuf = Builder.CreateShuffleVector(
	TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
	"rdx.shuf.r");
	TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
	"bin.rdx");
	} else {
	Value *UpperHalf =
	createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
	Value *Shuf = Builder.CreateShuffleVector(
	TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
	TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
	}
	}

	// The result is in the first element of the vector.
	return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
	}
	};
	} // end anonymous namespace

	/// \brief Match a chain of insertelement instructions that builds a vector
	/// starting from undef, e.g.
	///  %ra = insertelement <4 x float> undef, float %s0, i32 0
	///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
	///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
	///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
	///
	/// Every insertelement walked is appended to \p BuildVector and its inserted
	/// scalar to \p BuildVectorOpds (also when the match ends up failing).
	///
	/// \returns true if the chain matches.
	static bool findBuildVector(InsertElementInst *FirstInsertElem,
	                            SmallVectorImpl<Value *> &BuildVector,
	                            SmallVectorImpl<Value *> &BuildVectorOpds) {
	  // The chain has to start from an undef vector.
	  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
	    return false;

	  for (InsertElementInst *Cur = FirstInsertElem;;) {
	    BuildVector.push_back(Cur);
	    BuildVectorOpds.push_back(Cur->getOperand(1));

	    // An unused insert cannot be part of a build vector.
	    if (Cur->use_empty())
	      return false;

	    // Reaching a user that is not an insertelement ends the chain
	    // successfully.
	    auto *Following = dyn_cast<InsertElementInst>(Cur->user_back());
	    if (!Following)
	      return true;

	    // Intermediate inserts must feed only the next insertelement. It's OK if
	    // the final constructed vector is used multiple times.
	    if (!Cur->hasOneUse())
	      return false;

	    Cur = Following;
	  }
	}

	/// \brief Like findBuildVector, but looks backwards for construction of aggregate.
	///
	/// Starting at \p IV, recurses through the aggregate operands until an undef
	/// aggregate is reached, then appends each insertvalue to \p BuildVector and
	/// its inserted value to \p BuildVectorOpds on the way back, so the lists end
	/// up ordered from the first insert to \p IV.
	///
	/// \return true if it matches.
	static bool findBuildAggregate(InsertValueInst *IV,
	                               SmallVectorImpl<Value *> &BuildVector,
	                               SmallVectorImpl<Value *> &BuildVectorOpds) {
	  // Every insertvalue in the chain must have exactly one use.
	  if (!IV->hasOneUse())
	    return false;
	  Value *V = IV->getAggregateOperand();
	  if (!isa<UndefValue>(V)) {
	    InsertValueInst *I = dyn_cast<InsertValueInst>(V);
	    if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
	      return false;
	  }
	  BuildVector.push_back(IV);
	  BuildVectorOpds.push_back(IV->getInsertedValueOperand());
	  return true;
	}

	static bool PhiTypeSorterFunc(Value V, Value V2) {
	return V->getType() < V2->getType();
	}

	/// \brief Try and get a reduction value from a phi node.
	///
	/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
	/// if they come from either \p ParentBB or a containing loop latch.
	///
	/// Only the first two incoming edges of \p P are inspected.
	///
	/// \returns A candidate reduction value if possible, or \code nullptr \endcode
	/// if not possible.
	static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
	                                BasicBlock *ParentBB, LoopInfo *LI) {
	  // There are situations where the reduction value is not dominated by the
	  // reduction phi. Vectorizing such cases has been reported to cause
	  // miscompiles. See PR25787.
	  auto DominatedReduxValue = [&](Value *R) {
	    auto *RdxInst = dyn_cast<Instruction>(R);
	    return RdxInst && DT->dominates(P->getParent(), RdxInst->getParent());
	  };

	  Value *Rdx = nullptr;

	  // Return the incoming value if it comes from the same BB as the phi node.
	  if (P->getIncomingBlock(0) == ParentBB) {
	    Rdx = P->getIncomingValue(0);
	  } else if (P->getIncomingBlock(1) == ParentBB) {
	    Rdx = P->getIncomingValue(1);
	  }

	  if (Rdx && DominatedReduxValue(Rdx))
	    return Rdx;

	  // Otherwise, check whether we have a loop latch to look at.
	  Loop *BBL = LI->getLoopFor(ParentBB);
	  if (!BBL)
	    return nullptr;
	  BasicBlock *BBLatch = BBL->getLoopLatch();
	  if (!BBLatch)
	    return nullptr;

	  // There is a loop latch, return the incoming value if it comes from
	  // that. This reduction pattern occasionally turns up.
	  if (P->getIncomingBlock(0) == BBLatch) {
	    Rdx = P->getIncomingValue(0);
	  } else if (P->getIncomingBlock(1) == BBLatch) {
	    Rdx = P->getIncomingValue(1);
	  }

	  if (Rdx && DominatedReduxValue(Rdx))
	    return Rdx;

	  return nullptr;
	}

	-namespace {
	-/// Tracks instructions and its children.
	-class WeakVHWithLevel final : public CallbackVH {
	- /// Operand index of the instruction currently being analyzed.
	- unsigned Level = 0;
	- /// Is this the instruction that should be vectorized, or are we now
	- /// processing children (i.e. operands of this instruction) for potential
	- /// vectorization?
	- bool IsInitial = true;
	-
	-public:
	- explicit WeakVHWithLevel() = default;
	- WeakVHWithLevel(Value *V) : CallbackVH(V){};
	- /// Restart children analysis each time it is replaced by the new instruction.
	- void allUsesReplacedWith(Value *New) override {
	- setValPtr(New);
	- Level = 0;
	- IsInitial = true;
	- }
	- /// Check if the instruction was not deleted during vectorization.
	// NOTE(review): isValid() yields true when getValPtr() is null, which looks
	// inverted relative to the comment above -- confirm against the revision.
	- bool isValid() const { return !getValPtr(); }
	- /// Must the instruction itself be vectorized?
	- bool isInitial() const { return IsInitial; }
	- /// Try to vectorize children.
	- void clearInitial() { IsInitial = false; }
	- /// Are all children processed already?
	- bool isFinal() const {
	- assert(getValPtr() &&
	- (isa<Instruction>(getValPtr()) &&
	- cast<Instruction>(getValPtr())->getNumOperands() >= Level));
	- return getValPtr() &&
	- cast<Instruction>(getValPtr())->getNumOperands() == Level;
	- }
	- /// Get next child operation.
	- Value *nextOperand() {
	- assert(getValPtr() && isa<Instruction>(getValPtr()) &&
	- cast<Instruction>(getValPtr())->getNumOperands() > Level);
	- return cast<Instruction>(getValPtr())->getOperand(Level++);
	- }
	- virtual ~WeakVHWithLevel() = default;
	-};
	-} // namespace
	-
	/// \brief Attempt to reduce a horizontal reduction.
	/// If it is legal to match a horizontal reduction feeding
	-/// the phi node P with reduction operators Root in a basic block BB, then check
	-/// if it can be done.
	+/// the phi node P with reduction operators BI, then check if it
	+/// can be done.
	/// \returns true if a horizontal reduction was matched and reduced.
	/// \returns false if a horizontal reduction was not matched.
	-static bool canBeVectorized(
	- PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
	- TargetTransformInfo *TTI,
	- const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
	+static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
	+ BoUpSLP &R, TargetTransformInfo *TTI,
	+ unsigned MinRegSize) {
	if (!ShouldVectorizeHor)
	return false;

	- if (!Root)
	+ HorizontalReduction HorRdx(MinRegSize);
	+ if (!HorRdx.matchAssociativeReduction(P, BI))
	return false;

	- if (Root->getParent() != BB)
	- return false;
	- SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
	- SmallSet<Value *, 8> VisitedInstrs;
	- bool Res = false;
	- while (!Stack.empty()) {
	- Value *V = Stack.back();
	- if (!V) {
	- Stack.pop_back();
	- continue;
	- }
	- auto *Inst = dyn_cast<Instruction>(V);
	- if (!Inst || isa<PHINode>(Inst)) {
	- Stack.pop_back();
	- continue;
	- }
	- if (Stack.back().isInitial()) {
	- Stack.back().clearInitial();
	- if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
	- HorizontalReduction HorRdx(R.getMinVecRegSize());
	- if (HorRdx.matchAssociativeReduction(P, BI)) {
	- // If there is a sufficient number of reduction values, reduce
	- // to a nearby power-of-2. Can safely generate oversized
	- // vectors and rely on the backend to split them to legal sizes.
	- HorRdx.ReduxWidth =
	- std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
	+ // If there is a sufficient number of reduction values, reduce
	+ // to a nearby power-of-2. Can safely generate oversized
	+ // vectors and rely on the backend to split them to legal sizes.
	+ HorRdx.ReduxWidth =
	+ std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));

	- if (HorRdx.tryToReduce(R, TTI)) {
	- Res = true;
	- P = nullptr;
	- continue;
	- }
	- }
	- if (P) {
	- Inst = dyn_cast<Instruction>(BI->getOperand(0));
	- if (Inst == P)
	- Inst = dyn_cast<Instruction>(BI->getOperand(1));
	- if (!Inst) {
	- P = nullptr;
	- continue;
	- }
	- }
	- }
	- P = nullptr;
	- if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
	- Res = true;
	- continue;
	- }
	- }
	- if (Stack.back().isFinal()) {
	- Stack.pop_back();
	- continue;
	- }
	-
	- if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
	- if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
	- Stack.size() < RecursionMaxDepth)
	- Stack.push_back(NextV);
	- }
	- return Res;
	+ return HorRdx.tryToReduce(R, TTI);
	}

	// Removed by this revision: wrapper that forwarded an instruction root to
	// canBeVectorized() with tryToVectorize() as the vectorization callback.
	-bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
	- BasicBlock *BB, BoUpSLP &R,
	- TargetTransformInfo *TTI) {
	- if (!V)
	- return false;
	- auto *I = dyn_cast<Instruction>(V);
	- if (!I)
	- return false;
	-
	- if (!isa<BinaryOperator>(I))
	- P = nullptr;
	- // Try to match and vectorize a horizontal reduction.
	- return canBeVectorized(P, I, BB, R, TTI,
	- [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
	- return tryToVectorize(BI, R);
	- });
	-}
	-
	// Walks BB for vectorization roots: first runs of same-typed PHI nodes, then,
	// per instruction, reduction PHIs, stores, returns, compares, insertelement
	// chains and insertvalue aggregates feeding stores. After a successful
	// vectorization the block iterators are reset to the start of BB.
	// Returns true if anything in the block was changed.
	bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
	bool Changed = false;
	SmallVector<Value *, 4> Incoming;
	SmallSet<Value *, 16> VisitedInstrs;

	bool HaveVectorizedPhiNodes = true;
	while (HaveVectorizedPhiNodes) {
	HaveVectorizedPhiNodes = false;

	// Collect the incoming values from the PHIs.
	Incoming.clear();
	for (Instruction &I : *BB) {
	PHINode *P = dyn_cast<PHINode>(&I);
	if (!P)
	break;

	if (!VisitedInstrs.count(P))
	Incoming.push_back(P);
	}

	// Sort by type.
	std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

	// Try to vectorize elements base on their type.
	for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
	E = Incoming.end();
	IncIt != E;) {

	// Look for the next elements with the same type.
	SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
	while (SameTypeIt != E &&
	(*SameTypeIt)->getType() == (*IncIt)->getType()) {
	VisitedInstrs.insert(*SameTypeIt);
	++SameTypeIt;
	}

	// Try to vectorize them.
	unsigned NumElts = (SameTypeIt - IncIt);
	DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
	if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
	// Success start over because instructions might have been changed.
	HaveVectorizedPhiNodes = true;
	Changed = true;
	break;
	}

	// Start over at the next instruction of a different type (or the end).
	IncIt = SameTypeIt;
	}
	}

	VisitedInstrs.clear();

	for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
	// We may go through BB multiple times so skip the one we have checked.
	if (!VisitedInstrs.insert(&*it).second)
	continue;

	if (isa<DbgInfoIntrinsic>(it))
	continue;

	// Try to vectorize reductions that use PHINodes.
	if (PHINode *P = dyn_cast<PHINode>(it)) {
	// Check that the PHI is a reduction PHI.
	if (P->getNumIncomingValues() != 2)
	return Changed;

	+ Value *Rdx = getReductionValue(DT, P, BB, LI);
	+
	+ // Check if this is a Binary Operator.
	+ BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
	+ if (!BI)
	+ continue;
	+
	// Try to match and vectorize a horizontal reduction.
	- if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
	- TTI)) {
	+ if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
	Changed = true;
	it = BB->begin();
	e = BB->end();
	continue;
	}
	+
	+ Value *Inst = BI->getOperand(0);
	+ if (Inst == P)
	+ Inst = BI->getOperand(1);
	+
	+ if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
	+ // We would like to start over since some instructions are deleted
	+ // and the iterator may become invalid value.
	+ Changed = true;
	+ it = BB->begin();
	+ e = BB->end();
	+ continue;
	+ }
	+
	continue;
	}

	- if (ShouldStartVectorizeHorAtStore) {
	- if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
	- // Try to match and vectorize a horizontal reduction.
	- if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
	- TTI)) {
	- Changed = true;
	- it = BB->begin();
	- e = BB->end();
	- continue;
	+ if (ShouldStartVectorizeHorAtStore)
	+ if (StoreInst *SI = dyn_cast<StoreInst>(it))
	+ if (BinaryOperator *BinOp =
	+ dyn_cast<BinaryOperator>(SI->getValueOperand())) {
	+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
	+ R.getMinVecRegSize()) ||
	+ tryToVectorize(BinOp, R)) {
	+ Changed = true;
	+ it = BB->begin();
	+ e = BB->end();
	+ continue;
	+ }
	}
	- }
	- }

	// Try to vectorize horizontal reductions feeding into a return.
	- if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
	- if (RI->getNumOperands() != 0) {
	- // Try to match and vectorize a horizontal reduction.
	- if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
	- Changed = true;
	- it = BB->begin();
	- e = BB->end();
	- continue;
	+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
	+ if (RI->getNumOperands() != 0)
	+ if (BinaryOperator *BinOp =
	+ dyn_cast<BinaryOperator>(RI->getOperand(0))) {
	+ DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
	+ if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
	+ R.getMinVecRegSize()) ||
	+ tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
	+ R)) {
	+ Changed = true;
	+ it = BB->begin();
	+ e = BB->end();
	+ continue;
	+ }
	}
	- }
	- }

	// Try to vectorize trees that start at compare instructions.
	if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
	if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
	Changed = true;
	// We would like to start over since some instructions are deleted
	// and the iterator may become invalid value.
	it = BB->begin();
	e = BB->end();
	continue;
	}

	- for (int I = 0; I < 2; ++I) {
	- if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
	- Changed = true;
	- // We would like to start over since some instructions are deleted
	- // and the iterator may become invalid value.
	- it = BB->begin();
	- e = BB->end();
	- break;
	+ for (int i = 0; i < 2; ++i) {
	+ if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
	+ if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
	+ Changed = true;
	+ // We would like to start over since some instructions are deleted
	+ // and the iterator may become invalid value.
	+ it = BB->begin();
	+ e = BB->end();
	+ break;
	+ }
	}
	}
	continue;
	}

	// Try to vectorize trees that start at insertelement instructions.
	if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
	SmallVector<Value *, 16> BuildVector;
	SmallVector<Value *, 16> BuildVectorOpds;
	if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
	continue;

	// Vectorize starting with the build vector operands ignoring the
	// BuildVector instructions for the purpose of scheduling and user
	// extraction.
	if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
	Changed = true;
	it = BB->begin();
	e = BB->end();
	}

	continue;
	}

	// Try to vectorize trees that start at insertvalue instructions feeding into
	// a store.
	if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
	if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
	const DataLayout &DL = BB->getModule()->getDataLayout();
	if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
	SmallVector<Value *, 16> BuildVector;
	SmallVector<Value *, 16> BuildVectorOpds;
	if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
	continue;

	DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
	if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
	Changed = true;
	it = BB->begin();
	e = BB->end();
	}
	continue;
	}
	}
	}
	}

	return Changed;
	}

	/// Try to vectorize the index computations of the getelementptrs recorded in
	/// GEPs.
	///
	/// Each list in GEPs is processed in chunks of at most 16 entries. Within a
	/// chunk, getelementptrs that differ by a constant or that share the same
	/// first index are dropped, and the first index of every remaining candidate
	/// is handed to tryToVectorizeList as one bundle.
	///
	/// \param BB Not referenced by this function.
	/// \param R  Forwarded unchanged to tryToVectorizeList.
	/// \returns true if any tryToVectorizeList call reported a change.
	bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
	  auto Changed = false;
	  for (auto &Entry : GEPs) {

	    // If the getelementptr list has fewer than two elements, there's nothing
	    // to do.
	    if (Entry.second.size() < 2)
	      continue;

	    DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
	                 << Entry.second.size() << ".\n");

	    // We process the getelementptr list in chunks of 16 (like we do for
	    // stores) to minimize compile-time.
	    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
	      auto Len = std::min<unsigned>(BE - BI, 16);
	      auto GEPList = makeArrayRef(&Entry.second[BI], Len);

	      // Initialize a set of candidate getelementptrs. Note that we use a
	      // SetVector here to preserve program order. If the index computations
	      // are vectorizable and begin with loads, we want to minimize the chance
	      // of having to reorder them later.
	      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

	      // Some of the candidates may have already been vectorized after we
	      // initially collected them. If so, the WeakVHs will have nullified the
	      // values, so remove them from the set of candidates.
	      Candidates.remove(nullptr);

	      // Remove from the set of candidates all pairs of getelementptrs with
	      // constant differences. Such getelementptrs are likely not good
	      // candidates for vectorization in a bottom-up phase since one can be
	      // computed from the other. We also ensure all candidate getelementptr
	      // indices are unique.
	      //
	      // NOTE(review): cast<> is applied to GEPList entries before the
	      // Candidates.count() check, while the comment above says entries may
	      // have been nullified. Confirm a null entry cannot reach cast<> here.
	      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
	        auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
	        if (!Candidates.count(GEPI))
	          continue;
	        auto *SCEVI = SE->getSCEV(GEPList[I]);
	        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
	          auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
	          auto *SCEVJ = SE->getSCEV(GEPList[J]);
	          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
	            Candidates.remove(GEPList[I]);
	            Candidates.remove(GEPList[J]);
	          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
	            Candidates.remove(GEPList[J]);
	          }
	        }
	      }

	      // We break out of the above computation as soon as we know there are
	      // fewer than two candidates remaining.
	      if (Candidates.size() < 2)
	        continue;

	      // Add the single, non-constant index of each candidate to the bundle. We
	      // ensured the indices met these constraints when we originally collected
	      // the getelementptrs.
	      SmallVector<Value *, 16> Bundle(Candidates.size());
	      auto BundleIndex = 0u;
	      for (auto *V : Candidates) {
	        auto *GEP = cast<GetElementPtrInst>(V);
	        auto *GEPIdx = GEP->idx_begin()->get();
	        assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
	        Bundle[BundleIndex++] = GEPIdx;
	      }

	      // Try and vectorize the indices. We are currently only interested in
	      // gather-like cases of the form:
	      //
	      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
	      //
	      // where the loads of "a", the loads of "b", and the subtractions can be
	      // performed in parallel. It's likely that detecting this pattern in a
	      // bottom-up phase will be simpler and less costly than building a
	      // full-blown top-down phase beginning at the consecutive loads.
	      Changed |= tryToVectorizeList(Bundle, R);
	    }
	  }
	  return Changed;
	}

	/// Try to vectorize every store chain recorded in Stores.
	///
	/// Chains with fewer than two stores are skipped. Longer chains are handed
	/// to vectorizeStores in consecutive chunks of at most 16 stores.
	///
	/// \param R Forwarded unchanged to vectorizeStores.
	/// \returns true if vectorizeStores reported a change for any chunk.
	bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
	  bool Changed = false;
	  // Attempt to sort and vectorize each of the store-groups.
	  for (auto &Entry : Stores) {
	    auto &Chain = Entry.second;
	    if (Chain.size() < 2)
	      continue;

	    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
	                 << Chain.size() << ".\n");

	    // Process the stores in chunks of 16.
	    // TODO: The limit of 16 inhibits greater vectorization factors.
	    // For example, AVX2 supports v32i8. Increasing this limit, however,
	    // may cause a significant compile-time increase.
	    for (unsigned CI = 0, CE = Chain.size(); CI < CE; CI += 16) {
	      unsigned Len = std::min<unsigned>(CE - CI, 16);
	      Changed |= vectorizeStores(makeArrayRef(&Chain[CI], Len), R);
	    }
	  }
	  return Changed;
	}

	// Registration of the SLPVectorizer pass: its static ID, its display name,
	// and the passes listed as dependencies between INITIALIZE_PASS_BEGIN and
	// INITIALIZE_PASS_END.
	char SLPVectorizer::ID = 0;
	// Human-readable pass name handed to the INITIALIZE_PASS_* macros below.
	static const char lv_name[] = "SLP Vectorizer";
	INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
	INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
	INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
	INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

	namespace llvm {
	/// Returns a freshly heap-allocated SLPVectorizer. Nothing in this function
	/// keeps the pointer, so the caller is responsible for it.
	Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
	}
	Index: projects/clang400-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp (revision 314268)
	+++ projects/clang400-import/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp (revision 314269)
	@@ -1,6795 +1,6797 @@
	//===----- CGOpenMPRuntime.cpp - Interface to OpenMP Runtimes -------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This provides a class for OpenMP runtime code generation.
	//
	//===----------------------------------------------------------------------===//

	#include "CGCXXABI.h"
	#include "CGCleanup.h"
	#include "CGOpenMPRuntime.h"
	#include "CodeGenFunction.h"
	#include "ConstantBuilder.h"
	#include "clang/AST/Decl.h"
	#include "clang/AST/StmtOpenMP.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/Bitcode/BitcodeReader.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/Value.h"
	#include "llvm/Support/Format.h"
	#include "llvm/Support/raw_ostream.h"
	#include <cassert>

	using namespace clang;
	using namespace CodeGen;

	namespace {
	/// \brief Base class for handling code generation inside OpenMP regions.
	///
	/// Stores the region kind, the code generation callback, the directive kind
	/// and the cancel flag passed to the constructor, and exposes them through
	/// the accessors below.
	class CGOpenMPRegionInfo : public CodeGenFunction::CGCapturedStmtInfo {
	public:
	  /// \brief Kinds of OpenMP regions used in codegen.
	  enum CGOpenMPRegionKind {
	    /// \brief Region with outlined function for standalone 'parallel'
	    /// directive.
	    ParallelOutlinedRegion,
	    /// \brief Region with outlined function for standalone 'task' directive.
	    TaskOutlinedRegion,
	    /// \brief Region for constructs that do not require function outlining,
	    /// like 'for', 'sections', 'atomic' etc. directives.
	    InlinedRegion,
	    /// \brief Region with outlined function for standalone 'target' directive.
	    TargetRegion,
	  };

	  /// \brief Construct region info bound to the captured statement \p CS.
	  CGOpenMPRegionInfo(const CapturedStmt &CS,
	                     const CGOpenMPRegionKind RegionKind,
	                     const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind,
	                     bool HasCancel)
	      : CGCapturedStmtInfo(CS, CR_OpenMP), RegionKind(RegionKind),
	        CodeGen(CodeGen), Kind(Kind), HasCancel(HasCancel) {}

	  /// \brief Construct region info without an associated captured statement.
	  CGOpenMPRegionInfo(const CGOpenMPRegionKind RegionKind,
	                     const RegionCodeGenTy &CodeGen, OpenMPDirectiveKind Kind,
	                     bool HasCancel)
	      : CGCapturedStmtInfo(CR_OpenMP), RegionKind(RegionKind), CodeGen(CodeGen),
	        Kind(Kind), HasCancel(HasCancel) {}

	  /// \brief Get a variable or parameter for storing global thread id
	  /// inside OpenMP construct.
	  virtual const VarDecl *getThreadIDVariable() const = 0;

	  /// \brief Emit the captured statement body.
	  void EmitBody(CodeGenFunction &CGF, const Stmt *S) override;

	  /// \brief Get an LValue for the current ThreadID variable.
	  /// \return LValue for thread id variable. This LValue always has type int32*.
	  virtual LValue getThreadIDVariableLValue(CodeGenFunction &CGF);

	  /// \brief Hook for untied-task switch emission; a no-op in this base class.
	  virtual void emitUntiedSwitch(CodeGenFunction & /*CGF*/) {}

	  CGOpenMPRegionKind getRegionKind() const { return RegionKind; }

	  OpenMPDirectiveKind getDirectiveKind() const { return Kind; }

	  bool hasCancel() const { return HasCancel; }

	  /// \brief Any captured-statement info of kind CR_OpenMP is an OpenMP region
	  /// info.
	  static bool classof(const CGCapturedStmtInfo *Info) {
	    return Info->getKind() == CR_OpenMP;
	  }

	  ~CGOpenMPRegionInfo() override = default;

	protected:
	  CGOpenMPRegionKind RegionKind;
	  RegionCodeGenTy CodeGen;
	  OpenMPDirectiveKind Kind;
	  bool HasCancel;
	};

	/// \brief Captured-statement info for regions whose kind is
	/// ParallelOutlinedRegion. Carries the thread id variable and the helper
	/// name supplied by the creator.
	class CGOpenMPOutlinedRegionInfo final : public CGOpenMPRegionInfo {
	public:
	  CGOpenMPOutlinedRegionInfo(const CapturedStmt &CS, const VarDecl *ThreadIDVar,
	                             const RegionCodeGenTy &CodeGen,
	                             OpenMPDirectiveKind Kind, bool HasCancel,
	                             StringRef HelperName)
	      : CGOpenMPRegionInfo(CS, ParallelOutlinedRegion, CodeGen, Kind,
	                           HasCancel),
	        ThreadIDVar(ThreadIDVar), HelperName(HelperName) {
	    // A region of this kind must always come with a thread id variable.
	    assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region.");
	  }

	  /// \brief Variable or parameter that holds the global thread id inside the
	  /// OpenMP construct.
	  const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; }

	  /// \brief Name of the capture helper, as given to the constructor.
	  StringRef getHelperName() const override { return HelperName; }

	  /// \brief True only for OpenMP region infos whose kind is
	  /// ParallelOutlinedRegion.
	  static bool classof(const CGCapturedStmtInfo *Info) {
	    if (!CGOpenMPRegionInfo::classof(Info))
	      return false;
	    const auto *RegionInfo = cast<CGOpenMPRegionInfo>(Info);
	    return RegionInfo->getRegionKind() == ParallelOutlinedRegion;
	  }

	private:
	  /// \brief Variable or parameter storing the global thread id for OpenMP
	  /// constructs.
	  const VarDecl *ThreadIDVar;
	  /// \brief Capture helper name returned by getHelperName().
	  StringRef HelperName;
	};

	/// \brief API for captured statement code generation in OpenMP constructs.
	///
	/// Region info for regions of kind TaskOutlinedRegion. It additionally holds
	/// a reference to an UntiedTaskActionTy that emits the switching code for
	/// untied tasks.
	class CGOpenMPTaskOutlinedRegionInfo final : public CGOpenMPRegionInfo {
	public:
	/// Pre/post action for task regions. When constructed with Tied == false,
	/// Enter() emits a switch over the value loaded through PartIDVar, and each
	/// emitUntiedSwitch() call appends one more case to that switch.
	class UntiedTaskActionTy final : public PrePostActionTy {
	bool Untied;
	const VarDecl *PartIDVar;
	const RegionCodeGenTy UntiedCodeGen;
	// Set by Enter() for untied tasks only; stays null otherwise.
	llvm::SwitchInst *UntiedSwitch = nullptr;

	public:
	UntiedTaskActionTy(bool Tied, const VarDecl *PartIDVar,
	const RegionCodeGenTy &UntiedCodeGen)
	: Untied(!Tied), PartIDVar(PartIDVar), UntiedCodeGen(UntiedCodeGen) {}
	/// For untied tasks, emits the dispatch switch. The default destination is
	/// the ".untied.done." block, which branches through cleanups to
	/// CGF.ReturnBlock; case 0 is the ".untied.jmp." block emitted right after
	/// it. Finishes by emitting the first switching point.
	void Enter(CodeGenFunction &CGF) override {
	if (Untied) {
	// Emit task switching point.
	auto PartIdLVal = CGF.EmitLoadOfPointerLValue(
	CGF.GetAddrOfLocalVar(PartIDVar),
	PartIDVar->getType()->castAs<PointerType>());
	auto *Res = CGF.EmitLoadOfScalar(PartIdLVal, SourceLocation());
	auto *DoneBB = CGF.createBasicBlock(".untied.done.");
	UntiedSwitch = CGF.Builder.CreateSwitch(Res, DoneBB);
	CGF.EmitBlock(DoneBB);
	CGF.EmitBranchThroughCleanup(CGF.ReturnBlock);
	CGF.EmitBlock(CGF.createBasicBlock(".untied.jmp."));
	UntiedSwitch->addCase(CGF.Builder.getInt32(0),
	CGF.Builder.GetInsertBlock());
	emitUntiedSwitch(CGF);
	}
	}
	/// For untied tasks, emits one switching point: stores the current number
	/// of switch cases through PartIDVar, runs UntiedCodeGen, branches to the
	/// return block, then registers a new ".untied.jmp." block as a further
	/// switch case and continues emission at ".untied.next.".
	void emitUntiedSwitch(CodeGenFunction &CGF) const {
	if (Untied) {
	auto PartIdLVal = CGF.EmitLoadOfPointerLValue(
	CGF.GetAddrOfLocalVar(PartIDVar),
	PartIDVar->getType()->castAs<PointerType>());
	CGF.EmitStoreOfScalar(CGF.Builder.getInt32(UntiedSwitch->getNumCases()),
	PartIdLVal);
	UntiedCodeGen(CGF);
	CodeGenFunction::JumpDest CurPoint =
	CGF.getJumpDestInCurrentScope(".untied.next.");
	CGF.EmitBranchThroughCleanup(CGF.ReturnBlock);
	CGF.EmitBlock(CGF.createBasicBlock(".untied.jmp."));
	UntiedSwitch->addCase(CGF.Builder.getInt32(UntiedSwitch->getNumCases()),
	CGF.Builder.GetInsertBlock());
	CGF.EmitBranchThroughCleanup(CurPoint);
	CGF.EmitBlock(CurPoint.getBlock());
	}
	}
	/// Number of cases currently on the untied switch.
	/// NOTE(review): UntiedSwitch is only assigned in Enter() when Untied is
	/// true, so this dereferences a null pointer otherwise; confirm callers
	/// only use it for untied tasks after Enter().
	unsigned getNumberOfParts() const { return UntiedSwitch->getNumCases(); }
	};
	CGOpenMPTaskOutlinedRegionInfo(const CapturedStmt &CS,
	const VarDecl *ThreadIDVar,
	const RegionCodeGenTy &CodeGen,
	OpenMPDirectiveKind Kind, bool HasCancel,
	const UntiedTaskActionTy &Action)
	: CGOpenMPRegionInfo(CS, TaskOutlinedRegion, CodeGen, Kind, HasCancel),
	ThreadIDVar(ThreadIDVar), Action(Action) {
	assert(ThreadIDVar != nullptr && "No ThreadID in OpenMP region.");
	}

	/// \brief Get a variable or parameter for storing global thread id
	/// inside OpenMP construct.
	const VarDecl *getThreadIDVariable() const override { return ThreadIDVar; }

	/// \brief Get an LValue for the current ThreadID variable.
	LValue getThreadIDVariableLValue(CodeGenFunction &CGF) override;

	/// \brief Get the name of the capture helper.
	StringRef getHelperName() const override { return ".omp_outlined."; }

	/// Forwards to the stored untied-task action.
	void emitUntiedSwitch(CodeGenFunction &CGF) override {
	Action.emitUntiedSwitch(CGF);
	}

	/// True only for OpenMP region infos whose kind is TaskOutlinedRegion.
	static bool classof(const CGCapturedStmtInfo *Info) {
	return CGOpenMPRegionInfo::classof(Info) &&
	cast<CGOpenMPRegionInfo>(Info)->getRegionKind() ==
	TaskOutlinedRegion;
	}

	private:
	/// \brief A variable or parameter storing global thread id for OpenMP
	/// constructs.
	const VarDecl *ThreadIDVar;
	/// Action for emitting code for untied tasks. Held by reference, so the
	/// referenced action has to outlive this object.
	const UntiedTaskActionTy &Action;
	};

	/// \brief API for inlined captured statement code generation in OpenMP
	/// constructs.
	///
	/// Remembers the previously active CGCapturedStmtInfo (OldCSI). When that
	/// info is itself an OpenMP region info, most queries are forwarded to it
	/// through OuterRegionInfo.
	class CGOpenMPInlinedRegionInfo : public CGOpenMPRegionInfo {
	public:
	  CGOpenMPInlinedRegionInfo(CodeGenFunction::CGCapturedStmtInfo *OldCSI,
	                            const RegionCodeGenTy &CodeGen,
	                            OpenMPDirectiveKind Kind, bool HasCancel)
	      : CGOpenMPRegionInfo(InlinedRegion, CodeGen, Kind, HasCancel),
	        OldCSI(OldCSI),
	        OuterRegionInfo(dyn_cast_or_null<CGOpenMPRegionInfo>(OldCSI)) {}

	  /// \brief Retrieve the value of the context parameter.
	  llvm::Value *getContextValue() const override {
	    if (OuterRegionInfo)
	      return OuterRegionInfo->getContextValue();
	    llvm_unreachable("No context value for inlined OpenMP region");
	  }

	  /// \brief Set the context value on the outer region; unreachable when
	  /// there is no outer OpenMP region.
	  void setContextValue(llvm::Value *V) override {
	    if (OuterRegionInfo) {
	      OuterRegionInfo->setContextValue(V);
	      return;
	    }
	    llvm_unreachable("No context value for inlined OpenMP region");
	  }

	  /// \brief Lookup the captured field decl for a variable.
	  const FieldDecl *lookup(const VarDecl *VD) const override {
	    if (OuterRegionInfo)
	      return OuterRegionInfo->lookup(VD);
	    // If there is no outer outlined region, no need to lookup in a list of
	    // captured variables, we can use the original one.
	    return nullptr;
	  }

	  /// \brief Field capturing 'this' in the outer region, or null when there is
	  /// no outer OpenMP region.
	  FieldDecl *getThisFieldDecl() const override {
	    if (OuterRegionInfo)
	      return OuterRegionInfo->getThisFieldDecl();
	    return nullptr;
	  }

	  /// \brief Get a variable or parameter for storing global thread id
	  /// inside OpenMP construct.
	  const VarDecl *getThreadIDVariable() const override {
	    if (OuterRegionInfo)
	      return OuterRegionInfo->getThreadIDVariable();
	    return nullptr;
	  }

	  /// \brief Get the name of the capture helper.
	  StringRef getHelperName() const override {
	    // This consults the previous info of any kind (OldCSI), not only an
	    // OpenMP one, so the local deliberately has its own name instead of
	    // shadowing the OuterRegionInfo member.
	    if (auto *OuterCSI = getOldCSI())
	      return OuterCSI->getHelperName();
	    llvm_unreachable("No helper name for inlined OpenMP construct");
	  }

	  /// \brief Forward untied-switch emission to the outer region, if any.
	  void emitUntiedSwitch(CodeGenFunction &CGF) override {
	    if (OuterRegionInfo)
	      OuterRegionInfo->emitUntiedSwitch(CGF);
	  }

	  CodeGenFunction::CGCapturedStmtInfo *getOldCSI() const { return OldCSI; }

	  /// \brief True only for OpenMP region infos whose kind is InlinedRegion.
	  static bool classof(const CGCapturedStmtInfo *Info) {
	    return CGOpenMPRegionInfo::classof(Info) &&
	           cast<CGOpenMPRegionInfo>(Info)->getRegionKind() == InlinedRegion;
	  }

	  ~CGOpenMPInlinedRegionInfo() override = default;

	private:
	  /// \brief CodeGen info about outer OpenMP region.
	  CodeGenFunction::CGCapturedStmtInfo *OldCSI;
	  /// \brief OldCSI viewed as an OpenMP region info, or null if it is not one.
	  CGOpenMPRegionInfo *OuterRegionInfo;
	};

	/// \brief API for captured statement code generation in OpenMP target
	/// constructs. For these captures, implicit parameters are used instead of
	/// the captured fields. The name of the target region has to be unique in a
	/// given application so it is provided by the client, because only the client
	/// has the information to generate that.
	class CGOpenMPTargetRegionInfo final : public CGOpenMPRegionInfo {
	public:
	  /// \brief Builds a TargetRegion info for directive OMPD_target with
	  /// cancellation disabled.
	  CGOpenMPTargetRegionInfo(const CapturedStmt &CS,
	                           const RegionCodeGenTy &CodeGen, StringRef HelperName)
	      : CGOpenMPRegionInfo(CS, TargetRegion, CodeGen, OMPD_target,
	                           /*HasCancel=*/false),
	        HelperName(HelperName) {}

	  /// \brief This is unused for target regions because each starts executing
	  /// with a single thread.
	  const VarDecl *getThreadIDVariable() const override { return nullptr; }

	  /// \brief Get the name of the capture helper.
	  StringRef getHelperName() const override { return HelperName; }

	  /// \brief True only for OpenMP region infos whose kind is TargetRegion.
	  static bool classof(const CGCapturedStmtInfo *Info) {
	    return CGOpenMPRegionInfo::classof(Info) &&
	           cast<CGOpenMPRegionInfo>(Info)->getRegionKind() == TargetRegion;
	  }

	private:
	  /// \brief Client-provided helper name returned by getHelperName().
	  StringRef HelperName;
	};

	/// Placeholder region code generator that must never actually run: calling
	/// it hits llvm_unreachable.
	static void EmptyCodeGen(CodeGenFunction &, PrePostActionTy &) {
	llvm_unreachable("No codegen for expressions");
	}
	/// \brief API for generation of expressions captured in an innermost OpenMP
	/// region.
	///
	/// Behaves like an inlined region info with EmptyCodeGen as its generator
	/// and, on construction, privatizes the non-local variables captured by the
	/// given statement.
	class CGOpenMPInnerExprInfo final : public CGOpenMPInlinedRegionInfo {
	public:
	  CGOpenMPInnerExprInfo(CodeGenFunction &CGF, const CapturedStmt &CS)
	      : CGOpenMPInlinedRegionInfo(CGF.CapturedStmtInfo, EmptyCodeGen,
	                                  OMPD_unknown,
	                                  /*HasCancel=*/false),
	        PrivScope(CGF) {
	    // Make sure the globals captured in the provided statement are local by
	    // using the privatization logic. We assume the same variable is not
	    // captured more than once.
	    for (auto &C : CS.captures()) {
	      if (!C.capturesVariable() && !C.capturesVariableByCopy())
	        continue;

	      const VarDecl *VD = C.getCapturedVar();
	      if (VD->isLocalVarDeclOrParm())
	        continue;

	      DeclRefExpr DRE(const_cast<VarDecl *>(VD),
	                      /*RefersToEnclosingVariableOrCapture=*/false,
	                      VD->getType().getNonReferenceType(), VK_LValue,
	                      SourceLocation());
	      // NOTE(review): the callback captures the loop-local DRE by reference;
	      // this is only sound if addPrivate invokes it before DRE goes out of
	      // scope. Confirm against OMPPrivateScope::addPrivate.
	      PrivScope.addPrivate(VD, [&CGF, &DRE]() -> Address {
	        return CGF.EmitLValue(&DRE).getAddress();
	      });
	    }
	    (void)PrivScope.Privatize();
	  }

	  /// \brief Lookup the captured field decl for a variable.
	  const FieldDecl *lookup(const VarDecl *VD) const override {
	    if (auto *FD = CGOpenMPInlinedRegionInfo::lookup(VD))
	      return FD;
	    return nullptr;
	  }

	  /// \brief Emit the captured statement body.
	  void EmitBody(CodeGenFunction &CGF, const Stmt *S) override {
	    llvm_unreachable("No body for expressions");
	  }

	  /// \brief Get a variable or parameter for storing global thread id
	  /// inside OpenMP construct.
	  const VarDecl *getThreadIDVariable() const override {
	    llvm_unreachable("No thread id for expressions");
	  }

	  /// \brief Get the name of the capture helper.
	  StringRef getHelperName() const override {
	    llvm_unreachable("No helper name for expressions");
	  }

	  /// \brief Always false: no info object is ever identified as this class.
	  static bool classof(const CGCapturedStmtInfo *Info) { return false; }

	private:
	  /// Private scope to capture global variables.
	  CodeGenFunction::OMPPrivateScope PrivScope;
	};

	/// \brief RAII for emitting code of OpenMP constructs.
	///
	/// The constructor installs a new CGOpenMPInlinedRegionInfo as
	/// CGF.CapturedStmtInfo and stashes CGF's lambda capture state; the
	/// destructor deletes that info and restores the previous state.
	class InlinedOpenMPRegionRAII {
	  CodeGenFunction &CGF;
	  /// Lambda capture fields swapped out of CGF for the lifetime of this object.
	  llvm::DenseMap<const VarDecl *, FieldDecl *> LambdaCaptureFields;
	  /// Saved value of CGF.LambdaThisCaptureField.
	  FieldDecl *LambdaThisCaptureField = nullptr;

	public:
	  /// \brief Constructs region for combined constructs.
	  /// \param CodeGen Code generation sequence for combined directives. Includes
	  /// a list of functions used for code generation of implicitly inlined
	  /// regions.
	  InlinedOpenMPRegionRAII(CodeGenFunction &CGF, const RegionCodeGenTy &CodeGen,
	                          OpenMPDirectiveKind Kind, bool HasCancel)
	      : CGF(CGF) {
	    // Start emission for the construct.
	    CGF.CapturedStmtInfo = new CGOpenMPInlinedRegionInfo(
	        CGF.CapturedStmtInfo, CodeGen, Kind, HasCancel);
	    // This member map starts empty, so the swap leaves CGF with no lambda
	    // capture fields while the region is being emitted.
	    std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
	    LambdaThisCaptureField = CGF.LambdaThisCaptureField;
	    CGF.LambdaThisCaptureField = nullptr;
	  }

	  ~InlinedOpenMPRegionRAII() {
	    // Restore original CapturedStmtInfo only if we're done with code emission.
	    auto *OldCSI =
	        cast<CGOpenMPInlinedRegionInfo>(CGF.CapturedStmtInfo)->getOldCSI();
	    delete CGF.CapturedStmtInfo;
	    CGF.CapturedStmtInfo = OldCSI;
	    std::swap(CGF.LambdaCaptureFields, LambdaCaptureFields);
	    CGF.LambdaThisCaptureField = LambdaThisCaptureField;
	  }
	};

	/// \brief Values for bit flags used in the ident_t to describe the fields.
	/// All enumerated elements are named and described in accordance with the code
	/// from http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
	/// Note that OMP_IDENT_BARRIER_IMPL and OMP_IDENT_BARRIER_IMPL_FOR share the
	/// value 0x40.
	enum OpenMPLocationFlags {
	/// \brief Use trampoline for internal microtask.
	OMP_IDENT_IMD = 0x01,
	/// \brief Use c-style ident structure.
	OMP_IDENT_KMPC = 0x02,
	/// \brief Atomic reduction option for kmpc_reduce.
	OMP_ATOMIC_REDUCE = 0x10,
	/// \brief Explicit 'barrier' directive.
	OMP_IDENT_BARRIER_EXPL = 0x20,
	/// \brief Implicit barrier in code.
	OMP_IDENT_BARRIER_IMPL = 0x40,
	/// \brief Implicit barrier in 'for' directive.
	OMP_IDENT_BARRIER_IMPL_FOR = 0x40,
	/// \brief Implicit barrier in 'sections' directive.
	OMP_IDENT_BARRIER_IMPL_SECTIONS = 0xC0,
	/// \brief Implicit barrier in 'single' directive.
	OMP_IDENT_BARRIER_IMPL_SINGLE = 0x140
	};

	/// \brief Describes ident structure that describes a source location.
	/// All descriptions are taken from
	/// http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h
	/// Original structure:
	/// typedef struct ident {
	/// kmp_int32 reserved_1; /**< might be used in Fortran;
	/// see above */
	/// kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags;
	/// KMP_IDENT_KMPC identifies this union
	/// member */
	/// kmp_int32 reserved_2; /**< not really used in Fortran any more;
	/// see above */
	///#if USE_ITT_BUILD
	/// /* but currently used for storing
	/// region-specific ITT */
	/// /* contextual information. */
	///#endif /* USE_ITT_BUILD */
	/// kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for
	/// C++ */
	/// char const *psource; /**< String describing the source location.
	/// The string is composed of semi-colon separated
	/// fields which describe the source file,
	/// the function and a pair of line numbers that
	/// delimit the construct.
	/// */
	/// } ident_t;
	///
	/// The enumerators below are declared in the same order as the fields of the
	/// struct quoted above.
	enum IdentFieldIndex {
	/// \brief might be used in Fortran
	IdentField_Reserved_1,
	/// \brief OMP_IDENT_xxx flags; OMP_IDENT_KMPC identifies this union member.
	IdentField_Flags,
	/// \brief Not really used in Fortran any more
	IdentField_Reserved_2,
	/// \brief Source[4] in Fortran, do not use for C++
	IdentField_Reserved_3,
	/// \brief String describing the source location. The string is composed of
	/// semi-colon separated fields which describe the source file, the function
	/// and a pair of line numbers that delimit the construct.
	IdentField_PSource
	};

	/// \brief Schedule types for 'omp for' loops (these enumerators are taken from
	/// the enum sched_type in kmp.h).
	/// Each OMP_ord_* value listed here is 32 above its OMP_sch_* counterpart, and
	/// the two modifier values at the end occupy single high bits (29 and 30).
	enum OpenMPSchedType {
	/// \brief Lower bound for default (unordered) versions.
	OMP_sch_lower = 32,
	OMP_sch_static_chunked = 33,
	OMP_sch_static = 34,
	OMP_sch_dynamic_chunked = 35,
	OMP_sch_guided_chunked = 36,
	OMP_sch_runtime = 37,
	OMP_sch_auto = 38,
	/// static with chunk adjustment (e.g., simd)
	OMP_sch_static_balanced_chunked = 45,
	/// \brief Lower bound for 'ordered' versions.
	OMP_ord_lower = 64,
	OMP_ord_static_chunked = 65,
	OMP_ord_static = 66,
	OMP_ord_dynamic_chunked = 67,
	OMP_ord_guided_chunked = 68,
	OMP_ord_runtime = 69,
	OMP_ord_auto = 70,
	/// \brief Default schedule; an alias of OMP_sch_static.
	OMP_sch_default = OMP_sch_static,
	/// \brief dist_schedule types
	OMP_dist_sch_static_chunked = 91,
	OMP_dist_sch_static = 92,
	/// Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
	/// Set if the monotonic schedule modifier was present.
	OMP_sch_modifier_monotonic = (1 << 29),
	/// Set if the nonmonotonic schedule modifier was present.
	OMP_sch_modifier_nonmonotonic = (1 << 30),
	};

	/// Identifiers for the runtime library entry points referenced by this file.
	/// The comment on each enumerator quotes the prototype of the corresponding
	/// runtime function.
	enum OpenMPRTLFunction {
	/// \brief Call to void __kmpc_fork_call(ident_t *loc, kmp_int32 argc,
	/// kmpc_micro microtask, ...);
	OMPRTL__kmpc_fork_call,
	/// \brief Call to void *__kmpc_threadprivate_cached(ident_t *loc,
	/// kmp_int32 global_tid, void *data, size_t size, void ***cache);
	OMPRTL__kmpc_threadprivate_cached,
	/// \brief Call to void __kmpc_threadprivate_register( ident_t *,
	/// void *data, kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor);
	OMPRTL__kmpc_threadprivate_register,
	// Call to kmp_int32 __kmpc_global_thread_num(ident_t *loc);
	OMPRTL__kmpc_global_thread_num,
	// Call to void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
	// kmp_critical_name *crit);
	OMPRTL__kmpc_critical,
	// Call to void __kmpc_critical_with_hint(ident_t *loc, kmp_int32
	// global_tid, kmp_critical_name *crit, uintptr_t hint);
	OMPRTL__kmpc_critical_with_hint,
	// Call to void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
	// kmp_critical_name *crit);
	OMPRTL__kmpc_end_critical,
	// Call to kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32
	// global_tid);
	OMPRTL__kmpc_cancel_barrier,
	// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
	OMPRTL__kmpc_barrier,
	// Call to void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid);
	OMPRTL__kmpc_for_static_fini,
	// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
	// global_tid);
	OMPRTL__kmpc_serialized_parallel,
	// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
	// global_tid);
	OMPRTL__kmpc_end_serialized_parallel,
	// Call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
	// kmp_int32 num_threads);
	OMPRTL__kmpc_push_num_threads,
	// Call to void __kmpc_flush(ident_t *loc);
	OMPRTL__kmpc_flush,
	// Call to kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid);
	OMPRTL__kmpc_master,
	// Call to void __kmpc_end_master(ident_t *, kmp_int32 global_tid);
	OMPRTL__kmpc_end_master,
	// Call to kmp_int32 __kmpc_omp_taskyield(ident_t *, kmp_int32 global_tid,
	// int end_part);
	OMPRTL__kmpc_omp_taskyield,
	// Call to kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid);
	OMPRTL__kmpc_single,
	// Call to void __kmpc_end_single(ident_t *, kmp_int32 global_tid);
	OMPRTL__kmpc_end_single,
	// Call to kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
	// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
	// kmp_routine_entry_t *task_entry);
	OMPRTL__kmpc_omp_task_alloc,
	// Call to kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t *
	// new_task);
	OMPRTL__kmpc_omp_task,
	// Call to void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
	// size_t cpy_size, void *cpy_data, void(*cpy_func)(void *, void *),
	// kmp_int32 didit);
	OMPRTL__kmpc_copyprivate,
	// Call to kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid,
	// kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void
	// (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck);
	OMPRTL__kmpc_reduce,
	// Call to kmp_int32 __kmpc_reduce_nowait(ident_t *loc, kmp_int32
	// global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
	// void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name
	// *lck);
	OMPRTL__kmpc_reduce_nowait,
	// Call to void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
	// kmp_critical_name *lck);
	OMPRTL__kmpc_end_reduce,
	// Call to void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
	// kmp_critical_name *lck);
	OMPRTL__kmpc_end_reduce_nowait,
	// Call to void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid,
	// kmp_task_t * new_task);
	OMPRTL__kmpc_omp_task_begin_if0,
	// Call to void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
	// kmp_task_t * new_task);
	OMPRTL__kmpc_omp_task_complete_if0,
	// Call to void __kmpc_ordered(ident_t *loc, kmp_int32 global_tid);
	OMPRTL__kmpc_ordered,
	// Call to void __kmpc_end_ordered(ident_t *loc, kmp_int32 global_tid);
	OMPRTL__kmpc_end_ordered,
	// Call to kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
	// global_tid);
	OMPRTL__kmpc_omp_taskwait,
	// Call to void __kmpc_taskgroup(ident_t *loc, kmp_int32 global_tid);
	OMPRTL__kmpc_taskgroup,
	// Call to void __kmpc_end_taskgroup(ident_t *loc, kmp_int32 global_tid);
	OMPRTL__kmpc_end_taskgroup,
	// Call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
	// int proc_bind);
	OMPRTL__kmpc_push_proc_bind,
	// Call to kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32
	// gtid, kmp_task_t * new_task, kmp_int32 ndeps, kmp_depend_info_t
	// *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
	OMPRTL__kmpc_omp_task_with_deps,
	// Call to void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32
	// gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
	// ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
	OMPRTL__kmpc_omp_wait_deps,
	// Call to kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32
	// global_tid, kmp_int32 cncl_kind);
	OMPRTL__kmpc_cancellationpoint,
	// Call to kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
	// kmp_int32 cncl_kind);
	OMPRTL__kmpc_cancel,
	// Call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
	// kmp_int32 num_teams, kmp_int32 thread_limit);
	OMPRTL__kmpc_push_num_teams,
	// Call to void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro
	// microtask, ...);
	OMPRTL__kmpc_fork_teams,
	// Call to void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
	// if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
	// sched, kmp_uint64 grainsize, void *task_dup);
	OMPRTL__kmpc_taskloop,
	// Call to void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32
	// num_dims, struct kmp_dim *dims);
	OMPRTL__kmpc_doacross_init,
	// Call to void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
	OMPRTL__kmpc_doacross_fini,
	// Call to void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64
	// *vec);
	OMPRTL__kmpc_doacross_post,
	// Call to void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64
	// *vec);
	OMPRTL__kmpc_doacross_wait,

	//
	// Offloading related calls
	//
	// Call to int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
	// arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t
	// *arg_types);
	OMPRTL__tgt_target,
	// Call to int32_t __tgt_target_teams(int32_t device_id, void *host_ptr,
	// int32_t arg_num, void** args_base, void **args, size_t *arg_sizes,
	// int32_t *arg_types, int32_t num_teams, int32_t thread_limit);
	OMPRTL__tgt_target_teams,
	// Call to void __tgt_register_lib(__tgt_bin_desc *desc);
	OMPRTL__tgt_register_lib,
	// Call to void __tgt_unregister_lib(__tgt_bin_desc *desc);
	OMPRTL__tgt_unregister_lib,
	// Call to void __tgt_target_data_begin(int32_t device_id, int32_t arg_num,
	// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
	OMPRTL__tgt_target_data_begin,
	// Call to void __tgt_target_data_end(int32_t device_id, int32_t arg_num,
	// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
	OMPRTL__tgt_target_data_end,
	// Call to void __tgt_target_data_update(int32_t device_id, int32_t arg_num,
	// void** args_base, void **args, size_t *arg_sizes, int32_t *arg_types);
	OMPRTL__tgt_target_data_update,
	};

	/// A basic class for pre|post-action for advanced codegen sequence for OpenMP
	/// region.
	///
	/// When emitted as a cleanup it calls Exit() on the wrapped action, provided
	/// the function still has an insertion point.
	class CleanupTy final : public EHScopeStack::Cleanup {
	  /// Action whose Exit() is run by Emit(); not owned by this object.
	  PrePostActionTy *Action;

	public:
	  explicit CleanupTy(PrePostActionTy *Action) : Action(Action) {}
	  void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
	    // Nothing can be emitted without an insertion point.
	    if (!CGF.HaveInsertPoint())
	      return;
	    Action->Exit(CGF);
	  }
	};

	} // anonymous namespace

	/// Runs the stored region code generator inside a fresh cleanups scope.
	///
	/// With no pre/post action attached, the callback is invoked with a
	/// default-constructed action. Otherwise a CleanupTy for the attached action
	/// is pushed first and the callback is invoked with that action.
	void RegionCodeGenTy::operator()(CodeGenFunction &CGF) const {
	  CodeGenFunction::RunCleanupsScope Scope(CGF);
	  if (!PrePostAction) {
	    PrePostActionTy DefaultAction;
	    Callback(CodeGen, CGF, DefaultAction);
	    return;
	  }
	  CGF.EHStack.pushCleanup<CleanupTy>(NormalAndEHCleanup, PrePostAction);
	  Callback(CodeGen, CGF, *PrePostAction);
	}

	/// Default thread id accessor: the thread id variable is treated as a
	/// pointer, and the returned LValue is obtained by loading through it.
	LValue CGOpenMPRegionInfo::getThreadIDVariableLValue(CodeGenFunction &CGF) {
	  const VarDecl *ThreadIDVar = getThreadIDVariable();
	  Address ThreadIDAddr = CGF.GetAddrOfLocalVar(ThreadIDVar);
	  const auto *ThreadIDPtrTy = ThreadIDVar->getType()->castAs<PointerType>();
	  return CGF.EmitLoadOfPointerLValue(ThreadIDAddr, ThreadIDPtrTy);
	}

	/// Emits the region body by running the stored CodeGen callback between a
	/// pushTerminate()/popTerminate() pair on the EH stack. The statement
	/// argument is unused. Does nothing when \p CGF has no insertion point.
	void CGOpenMPRegionInfo::EmitBody(CodeGenFunction &CGF, const Stmt * /*S*/) {
	  if (!CGF.HaveInsertPoint())
	    return;
	  // 1.2.2 OpenMP Language Terminology
	  // Structured block - An executable statement with a single entry at the
	  // top and a single exit at the bottom.
	  // The point of exit cannot be a branch out of the structured block.
	  // longjmp() and throw() must not violate the entry/exit criteria.
	  CGF.EHStack.pushTerminate();
	  CodeGen(CGF);
	  CGF.EHStack.popTerminate();
	}

	/// Returns an lvalue for the thread-id variable itself: the address of the
	/// local variable is used directly, with no load through a pointer.
	LValue CGOpenMPTaskOutlinedRegionInfo::getThreadIDVariableLValue(
	    CodeGenFunction &CGF) {
	  const auto *ThreadIDVar = getThreadIDVariable();
	  Address ThreadIDAddr = CGF.GetAddrOfLocalVar(ThreadIDVar);
	  return CGF.MakeAddrLValue(ThreadIDAddr, ThreadIDVar->getType(),
	                            AlignmentSource::Decl);
	}

	/// Creates the LLVM types used by the runtime interface and loads the
	/// offloading metadata.
	///
	/// ident_t is built as four i32 fields followed by one i8* field;
	/// KmpCriticalNameTy is an array of eight i32.
	CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
	    : CGM(CGM), OffloadEntriesInfoManager(CGM) {
	  // Each field needs its own comment: a single comment spanning two
	  // arguments would silently drop a field from the struct.
	  IdentTy = llvm::StructType::create(
	      "ident_t", CGM.Int32Ty /* reserved_1 */, CGM.Int32Ty /* flags */,
	      CGM.Int32Ty /* reserved_2 */, CGM.Int32Ty /* reserved_3 */,
	      CGM.Int8PtrTy /* psource */, nullptr);
	  KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8);

	  loadOffloadInfoMetadata();
	}

	/// Drops every entry held in InternalVars.
	void CGOpenMPRuntime::clear() { InternalVars.clear(); }

	/// Emits the helper function for a user-defined reduction combiner or
	/// initializer.
	///
	/// The helper has internal linkage, is named ".omp_combiner." or
	/// ".omp_initializer." depending on \p IsCombiner, returns void and takes two
	/// restrict-qualified pointers to \p Ty: the "out" parameter first, then the
	/// "in" parameter. Inside the helper \p In and \p Out are privatized so that
	/// they refer to the objects those parameters point to, and
	/// \p CombinerInitializer is emitted with its result ignored.
	///
	/// \returns the emitted function, marked always-inline.
	static llvm::Function *
	emitCombinerOrInitializer(CodeGenModule &CGM, QualType Ty,
	                          const Expr *CombinerInitializer, const VarDecl *In,
	                          const VarDecl *Out, bool IsCombiner) {
	  // void .omp_combiner.(Ty *in, Ty *out);
	  auto &C = CGM.getContext();
	  QualType PtrTy = C.getPointerType(Ty).withRestrict();
	  FunctionArgList Args;
	  ImplicitParamDecl OmpOutParm(C, /*DC=*/nullptr, Out->getLocation(),
	                               /*Id=*/nullptr, PtrTy);
	  ImplicitParamDecl OmpInParm(C, /*DC=*/nullptr, In->getLocation(),
	                              /*Id=*/nullptr, PtrTy);
	  Args.push_back(&OmpOutParm);
	  Args.push_back(&OmpInParm);
	  auto &FnInfo =
	      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	  auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo);
	  auto *Fn = llvm::Function::Create(
	      FnTy, llvm::GlobalValue::InternalLinkage,
	      IsCombiner ? ".omp_combiner." : ".omp_initializer.", &CGM.getModule());
	  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo);
	  // Replace any noinline attribute with alwaysinline.
	  Fn->removeFnAttr(llvm::Attribute::NoInline);
	  Fn->addFnAttr(llvm::Attribute::AlwaysInline);
	  CodeGenFunction CGF(CGM);
	  // Map "T omp_in;" variable to "*omp_in_parm" value in all expressions.
	  // Map "T omp_out;" variable to "*omp_out_parm" value in all expressions.
	  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args);
	  CodeGenFunction::OMPPrivateScope Scope(CGF);
	  Address AddrIn = CGF.GetAddrOfLocalVar(&OmpInParm);
	  Scope.addPrivate(In, [&CGF, AddrIn, PtrTy]() -> Address {
	    return CGF.EmitLoadOfPointerLValue(AddrIn, PtrTy->castAs<PointerType>())
	        .getAddress();
	  });
	  Address AddrOut = CGF.GetAddrOfLocalVar(&OmpOutParm);
	  Scope.addPrivate(Out, [&CGF, AddrOut, PtrTy]() -> Address {
	    return CGF.EmitLoadOfPointerLValue(AddrOut, PtrTy->castAs<PointerType>())
	        .getAddress();
	  });
	  (void)Scope.Privatize();
	  CGF.EmitIgnoredExpr(CombinerInitializer);
	  Scope.ForceCleanup();
	  CGF.FinishFunction();
	  return Fn;
	}

	void CGOpenMPRuntime::emitUserDefinedReduction(
	CodeGenFunction CGF, const OMPDeclareReductionDecl D) {
	if (UDRMap.count(D) > 0)
	return;
	auto &C = CGM.getContext();
	if (!In \|\| !Out) {
	In = &C.Idents.get("omp_in");
	Out = &C.Idents.get("omp_out");
	}
	llvm::Function *Combiner = emitCombinerOrInitializer(
	CGM, D->getType(), D->getCombiner(), cast<VarDecl>(D->lookup(In).front()),
	cast<VarDecl>(D->lookup(Out).front()),
	/IsCombiner=/true);
	llvm::Function *Initializer = nullptr;
	if (auto *Init = D->getInitializer()) {
	if (!Priv \|\| !Orig) {
	Priv = &C.Idents.get("omp_priv");
	Orig = &C.Idents.get("omp_orig");
	}
	Initializer = emitCombinerOrInitializer(
	CGM, D->getType(), Init, cast<VarDecl>(D->lookup(Orig).front()),
	cast<VarDecl>(D->lookup(Priv).front()),
	/IsCombiner=/false);
	}
	UDRMap.insert(std::make_pair(D, std::make_pair(Combiner, Initializer)));
	if (CGF) {
	auto &Decls = FunctionUDRMap.FindAndConstruct(CGF->CurFn);
	Decls.second.push_back(D);
	}
	}

	/// Returns the (combiner, initializer) function pair recorded for \p D in
	/// UDRMap. If there is no entry yet, the helpers are emitted first, with no
	/// associated CodeGenFunction.
	std::pair<llvm::Function *, llvm::Function *>
	CGOpenMPRuntime::getUserDefinedReduction(const OMPDeclareReductionDecl *D) {
	  auto I = UDRMap.find(D);
	  if (I != UDRMap.end())
	    return I->second;
	  emitUserDefinedReduction(/*CGF=*/nullptr, D);
	  return UDRMap.lookup(D);
	}

	// Layout information for ident_t.

	/// Alignment used for ident_t objects: the module's pointer alignment.
	static CharUnits getIdentAlign(CodeGenModule &CGM) {
	  const CharUnits PointerAlign = CGM.getPointerAlign();
	  return PointerAlign;
	}
	/// Size of ident_t: 16 bytes plus the size of one pointer.
	static CharUnits getIdentSize(CodeGenModule &CGM) {
	  const CharUnits PtrSize = CGM.getPointerSize();
	  assert((4 * PtrSize).isMultipleOf(CGM.getPointerAlign()));
	  return CharUnits::fromQuantity(16) + PtrSize;
	}
	/// Byte offset of \p Field inside ident_t.
	static CharUnits getOffsetOfIdentField(IdentFieldIndex Field) {
	  // Every field before the last one is an i32, so field N starts at byte
	  // 4 * N.
	  const CharUnits Int32Size = CharUnits::fromQuantity(4);
	  return unsigned(Field) * Int32Size;
	}
	/// Builds a struct GEP to \p Field of the ident_t object at \p Addr, passing
	/// along the field's byte offset and the optional value name \p Name.
	static Address createIdentFieldGEP(CodeGenFunction &CGF, Address Addr,
	                                   IdentFieldIndex Field,
	                                   const llvm::Twine &Name = "") {
	  return CGF.Builder.CreateStructGEP(Addr, Field,
	                                     getOffsetOfIdentField(Field), Name);
	}

	/// Generates the outlined function for the captured statement associated
	/// with \p D.
	///
	/// \p ThreadIDVar must have pointer type. The region's cancel flag is taken
	/// from \p D when it is a parallel, parallel sections or parallel for
	/// directive, and is false for any other directive kind.
	llvm::Value *CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
	    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
	    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
	  assert(ThreadIDVar->getType()->isPointerType() &&
	         "thread id variable must be of type kmp_int32 *");
	  const CapturedStmt *Captured = cast<CapturedStmt>(D.getAssociatedStmt());
	  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);

	  // Only these three directive kinds are queried for a cancel region.
	  const auto DirectiveHasCancel = [&D]() -> bool {
	    if (auto *Parallel = dyn_cast<OMPParallelDirective>(&D))
	      return Parallel->hasCancel();
	    if (auto *ParallelSections = dyn_cast<OMPParallelSectionsDirective>(&D))
	      return ParallelSections->hasCancel();
	    if (auto *ParallelFor = dyn_cast<OMPParallelForDirective>(&D))
	      return ParallelFor->hasCancel();
	    return false;
	  };
	  const bool HasCancel = DirectiveHasCancel();

	  CGOpenMPOutlinedRegionInfo CGInfo(*Captured, ThreadIDVar, CodeGen,
	                                    InnermostKind, HasCancel,
	                                    getOutlinedHelperName());
	  CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
	  return CGF.GenerateOpenMPCapturedStmtFunction(*Captured);
	}

	/// Generates the outlined function for the captured statement of the task
	/// directive \p D.
	///
	/// An UntiedTaskActionTy built from \p Tied, \p PartIDVar and a callback that
	/// emits a __kmpc_omp_task call is installed on \p CodeGen before the
	/// function is generated. \p ThreadIDVar must not have pointer type.
	///
	/// \param NumberOfParts Written only when \p Tied is false, with the part
	/// count reported by the action; left untouched for tied tasks.
	/// \returns the generated captured-statement function.
	llvm::Value *CGOpenMPRuntime::emitTaskOutlinedFunction(
	    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
	    const VarDecl *PartIDVar, const VarDecl *TaskTVar,
	    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen,
	    bool Tied, unsigned &NumberOfParts) {
	  // Emits __kmpc_omp_task(loc, thread id, task), where the task pointer is
	  // loaded through the TaskTVar local.
	  auto &&UntiedCodeGen = [this, &D, TaskTVar](CodeGenFunction &CGF,
	                                              PrePostActionTy &) {
	    auto *ThreadID = getThreadID(CGF, D.getLocStart());
	    auto *UpLoc = emitUpdateLocation(CGF, D.getLocStart());
	    llvm::Value *TaskArgs[] = {
	        UpLoc, ThreadID,
	        CGF.EmitLoadOfPointerLValue(CGF.GetAddrOfLocalVar(TaskTVar),
	                                    TaskTVar->getType()->castAs<PointerType>())
	            .getPointer()};
	    CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task), TaskArgs);
	  };
	  CGOpenMPTaskOutlinedRegionInfo::UntiedTaskActionTy Action(Tied, PartIDVar,
	                                                            UntiedCodeGen);
	  CodeGen.setAction(Action);
	  assert(!ThreadIDVar->getType()->isPointerType() &&
	         "thread id variable must be of type kmp_int32 for tasks");
	  auto *CS = cast<CapturedStmt>(D.getAssociatedStmt());
	  // D is not necessarily a task directive; anything else has no cancel.
	  auto *TD = dyn_cast<OMPTaskDirective>(&D);
	  CodeGenFunction CGF(CGM, true);
	  CGOpenMPTaskOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen,
	                                        InnermostKind,
	                                        TD ? TD->hasCancel() : false, Action);
	  CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
	  auto *Res = CGF.GenerateCapturedStmtFunction(*CS);
	  if (!Tied)
	    NumberOfParts = Action.getNumberOfParts();
	  return Res;
	}

	/// Returns the address of the default ident_t global for \p Flags, creating
	/// it and caching it in OpenMPDefaultLocMap on first use.
	///
	/// The global is a private, constant, unnamed_addr ident_t whose i32 fields
	/// are zero except for the flags field, and whose psource field points at the
	/// shared ";unknown;unknown;0;0;;" string.
	Address CGOpenMPRuntime::getOrCreateDefaultLocation(unsigned Flags) {
	  CharUnits Align = getIdentAlign(CGM);
	  llvm::Value *Entry = OpenMPDefaultLocMap.lookup(Flags);
	  if (!Entry) {
	    if (!DefaultOpenMPPSource) {
	      // Initialize default location for psource field of ident_t structure of
	      // all ident_t objects. Format is ";file;function;line;column;;".
	      // Taken from
	      // http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp_str.c
	      DefaultOpenMPPSource =
	          CGM.GetAddrOfConstantCString(";unknown;unknown;0;0;;").getPointer();
	      DefaultOpenMPPSource =
	          llvm::ConstantExpr::getBitCast(DefaultOpenMPPSource, CGM.Int8PtrTy);
	    }

	    // Field order must match the ident_t struct type: reserved_1, flags,
	    // reserved_2, reserved_3, psource.
	    ConstantInitBuilder builder(CGM);
	    auto fields = builder.beginStruct(IdentTy);
	    fields.addInt(CGM.Int32Ty, 0);
	    fields.addInt(CGM.Int32Ty, Flags);
	    fields.addInt(CGM.Int32Ty, 0);
	    fields.addInt(CGM.Int32Ty, 0);
	    fields.add(DefaultOpenMPPSource);
	    auto DefaultOpenMPLocation =
	        fields.finishAndCreateGlobal("", Align, /*isConstant*/ true,
	                                     llvm::GlobalValue::PrivateLinkage);
	    DefaultOpenMPLocation->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);

	    OpenMPDefaultLocMap[Flags] = Entry = DefaultOpenMPLocation;
	  }
	  return Address(Entry, Align);
	}

	/// Returns a pointer to an ident_t describing \p Loc for use as the location
	/// argument of a runtime call.
	///
	/// OMP_IDENT_KMPC is always added to \p Flags. When no debug info is being
	/// generated or \p Loc is invalid, the shared default location for the flags
	/// is returned. Otherwise a per-function ".kmpc_loc.addr" temporary is used:
	/// it is created and initialized from the default location on first use, and
	/// on every call its psource field is overwritten with a
	/// ";file;function;line;column;;" string for \p Loc.
	llvm::Value *CGOpenMPRuntime::emitUpdateLocation(CodeGenFunction &CGF,
	                                                 SourceLocation Loc,
	                                                 unsigned Flags) {
	  Flags |= OMP_IDENT_KMPC;
	  // If no debug info is generated - return global default location.
	  if (CGM.getCodeGenOpts().getDebugInfo() == codegenoptions::NoDebugInfo ||
	      Loc.isInvalid())
	    return getOrCreateDefaultLocation(Flags).getPointer();

	  assert(CGF.CurFn && "No function in current CodeGenFunction.");

	  Address LocValue = Address::invalid();
	  auto I = OpenMPLocThreadIDMap.find(CGF.CurFn);
	  if (I != OpenMPLocThreadIDMap.end())
	    LocValue = Address(I->second.DebugLoc, getIdentAlign(CGF.CGM));

	  // OpenMPLocThreadIDMap may have null DebugLoc and non-null ThreadID, if
	  // GetOpenMPThreadID was called before this routine.
	  if (!LocValue.isValid()) {
	    // Generate "ident_t .kmpc_loc.addr;"
	    Address AI = CGF.CreateTempAlloca(IdentTy, getIdentAlign(CGF.CGM),
	                                      ".kmpc_loc.addr");
	    auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn);
	    Elem.second.DebugLoc = AI.getPointer();
	    LocValue = AI;

	    // Emit the initializing memcpy at the alloca insertion point; the guard
	    // restores the builder's insertion point when this scope ends.
	    CGBuilderTy::InsertPointGuard IPG(CGF.Builder);
	    CGF.Builder.SetInsertPoint(CGF.AllocaInsertPt);
	    CGF.Builder.CreateMemCpy(LocValue, getOrCreateDefaultLocation(Flags),
	                             CGM.getSize(getIdentSize(CGF.CGM)));
	  }

	  // char **psource = &.kmpc_loc_<flags>.addr.psource;
	  Address PSource = createIdentFieldGEP(CGF, LocValue, IdentField_PSource);

	  // Location strings are cached per raw source-location encoding.
	  auto OMPDebugLoc = OpenMPDebugLocMap.lookup(Loc.getRawEncoding());
	  if (OMPDebugLoc == nullptr) {
	    SmallString<128> Buffer2;
	    llvm::raw_svector_ostream OS2(Buffer2);
	    // Build debug location
	    PresumedLoc PLoc = CGF.getContext().getSourceManager().getPresumedLoc(Loc);
	    OS2 << ";" << PLoc.getFilename() << ";";
	    // The function name is left empty when there is no current FunctionDecl.
	    if (const FunctionDecl *FD =
	            dyn_cast_or_null<FunctionDecl>(CGF.CurFuncDecl)) {
	      OS2 << FD->getQualifiedNameAsString();
	    }
	    OS2 << ";" << PLoc.getLine() << ";" << PLoc.getColumn() << ";;";
	    OMPDebugLoc = CGF.Builder.CreateGlobalStringPtr(OS2.str());
	    OpenMPDebugLocMap[Loc.getRawEncoding()] = OMPDebugLoc;
	  }
	  // *psource = ";<File>;<Function>;<Line>;<Column>;;";
	  CGF.Builder.CreateStore(OMPDebugLoc, PSource);

	  // Our callers always pass this to a runtime function, so for
	  // convenience, go ahead and return a naked pointer.
	  return LocValue.getPointer();
	}

	/// Returns the value holding the OpenMP thread id in the current function.
	///
	/// The lookup order is: a value already cached for CGF.CurFn; a load of the
	/// region's thread-id variable when the current captured-statement info is an
	/// OpenMP region that has one; otherwise a call to the runtime's
	/// global-thread-number entry point emitted at the alloca insertion point.
	llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF,
	                                          SourceLocation Loc) {
	  assert(CGF.CurFn && "No function in current CodeGenFunction.");

	  // Reuse the thread id cached for this function, if there is one.
	  auto CachedIt = OpenMPLocThreadIDMap.find(CGF.CurFn);
	  if (CachedIt != OpenMPLocThreadIDMap.end())
	    if (llvm::Value *Cached = CachedIt->second.ThreadID)
	      return Cached;

	  // Outlined function with the thread id passed in as an argument.
	  auto *RegionInfo =
	      dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo);
	  if (RegionInfo && RegionInfo->getThreadIDVariable()) {
	    auto ThreadIDLVal = RegionInfo->getThreadIDVariableLValue(CGF);
	    llvm::Value *Loaded =
	        CGF.EmitLoadOfLValue(ThreadIDLVal, Loc).getScalarVal();
	    // Cache the load only when it was emitted in the block that holds the
	    // alloca insertion point.
	    if (CGF.Builder.GetInsertBlock() == CGF.AllocaInsertPt->getParent())
	      OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn).second.ThreadID =
	          Loaded;
	    return Loaded;
	  }

	  // Not an outlined region: emit a call to
	  //   kmp_int32 __kmpc_global_thread_num(ident_t *loc)
	  // at the alloca insertion point and cache the result for the function.
	  // The guard restores the builder's insertion point on return.
	  CGBuilderTy::InsertPointGuard IPG(CGF.Builder);
	  CGF.Builder.SetInsertPoint(CGF.AllocaInsertPt);
	  llvm::Value *GlobalThreadNum =
	      CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_global_thread_num),
	                          emitUpdateLocation(CGF, Loc));
	  OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn).second.ThreadID =
	      GlobalThreadNum;
	  return GlobalThreadNum;
	}

	/// Discards the per-function state kept for CGF.CurFn: its entry in
	/// OpenMPLocThreadIDMap, and, if the function has an entry in FunctionUDRMap,
	/// every declaration listed there is erased from UDRMap before that entry is
	/// removed too.
	void CGOpenMPRuntime::functionFinished(CodeGenFunction &CGF) {
	  assert(CGF.CurFn && "No function in current CodeGenFunction.");
	  auto *Fn = CGF.CurFn;
	  // Erasing by key is a no-op when the function has no entry.
	  OpenMPLocThreadIDMap.erase(Fn);

	  auto UDRIt = FunctionUDRMap.find(Fn);
	  if (UDRIt == FunctionUDRMap.end())
	    return;
	  for (auto *D : UDRIt->second)
	    UDRMap.erase(D);
	  FunctionUDRMap.erase(UDRIt);
	}

	/// Returns the unqualified pointer type to the ident_t struct type held in
	/// IdentTy. IdentTy is used as-is; it is not created here.
	llvm::Type *CGOpenMPRuntime::getIdentTyPointerTy() {
	  return llvm::PointerType::getUnqual(IdentTy);
	}

	/// Returns the pointer type to the kmpc_micro function type, building and
	/// caching that function type in Kmpc_MicroTy on first use.
	llvm::Type *CGOpenMPRuntime::getKmpc_MicroPointerTy() {
	  if (Kmpc_MicroTy)
	    return llvm::PointerType::getUnqual(Kmpc_MicroTy);

	  // Build void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid,...)
	  auto *Int32PtrTy = llvm::PointerType::getUnqual(CGM.Int32Ty);
	  llvm::Type *MicroParams[] = {Int32PtrTy, Int32PtrTy};
	  Kmpc_MicroTy =
	      llvm::FunctionType::get(CGM.VoidTy, MicroParams, /*isVarArg=*/true);
	  return llvm::PointerType::getUnqual(Kmpc_MicroTy);
	}

	llvm::Constant *
	CGOpenMPRuntime::createRuntimeFunction(unsigned Function) {
	llvm::Constant *RTLFn = nullptr;
	switch (static_cast<OpenMPRTLFunction>(Function)) {
	case OMPRTL__kmpc_fork_call: {
	// Build void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro
	// microtask, ...);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
	getKmpc_MicroPointerTy()};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ true);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_fork_call");
	break;
	}
	case OMPRTL__kmpc_global_thread_num: {
	// Build kmp_int32 __kmpc_global_thread_num(ident_t *loc);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy()};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_global_thread_num");
	break;
	}
	case OMPRTL__kmpc_threadprivate_cached: {
	// Build void __kmpc_threadprivate_cached(ident_t loc,
	// kmp_int32 global_tid, void data, size_t size, void **cache);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
	CGM.VoidPtrTy, CGM.SizeTy,
	CGM.VoidPtrTy->getPointerTo()->getPointerTo()};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_threadprivate_cached");
	break;
	}
	case OMPRTL__kmpc_critical: {
	// Build void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
	// kmp_critical_name *crit);
	llvm::Type *TypeParams[] = {
	getIdentTyPointerTy(), CGM.Int32Ty,
	llvm::PointerType::getUnqual(KmpCriticalNameTy)};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_critical");
	break;
	}
	case OMPRTL__kmpc_critical_with_hint: {
	// Build void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
	// kmp_critical_name *crit, uintptr_t hint);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
	llvm::PointerType::getUnqual(KmpCriticalNameTy),
	CGM.IntPtrTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_critical_with_hint");
	break;
	}
	case OMPRTL__kmpc_threadprivate_register: {
	// Build void __kmpc_threadprivate_register(ident_t , void data,
	// kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor);
	// typedef void (kmpc_ctor)(void *);
	auto KmpcCtorTy =
	llvm::FunctionType::get(CGM.VoidPtrTy, CGM.VoidPtrTy,
	/isVarArg/ false)->getPointerTo();
	// typedef void (kmpc_cctor)(void , void );
	llvm::Type *KmpcCopyCtorTyArgs[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
	auto KmpcCopyCtorTy =
	llvm::FunctionType::get(CGM.VoidPtrTy, KmpcCopyCtorTyArgs,
	/isVarArg/ false)->getPointerTo();
	// typedef void (kmpc_dtor)(void );
	auto KmpcDtorTy =
	llvm::FunctionType::get(CGM.VoidTy, CGM.VoidPtrTy, /isVarArg/ false)
	->getPointerTo();
	llvm::Type *FnTyArgs[] = {getIdentTyPointerTy(), CGM.VoidPtrTy, KmpcCtorTy,
	KmpcCopyCtorTy, KmpcDtorTy};
	auto FnTy = llvm::FunctionType::get(CGM.VoidTy, FnTyArgs,
	/isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_threadprivate_register");
	break;
	}
	case OMPRTL__kmpc_end_critical: {
	// Build void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
	// kmp_critical_name *crit);
	llvm::Type *TypeParams[] = {
	getIdentTyPointerTy(), CGM.Int32Ty,
	llvm::PointerType::getUnqual(KmpCriticalNameTy)};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_critical");
	break;
	}
	case OMPRTL__kmpc_cancel_barrier: {
	// Build kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32
	// global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name/ "__kmpc_cancel_barrier");
	break;
	}
	case OMPRTL__kmpc_barrier: {
	// Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name/ "__kmpc_barrier");
	break;
	}
	case OMPRTL__kmpc_for_static_fini: {
	// Build void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_for_static_fini");
	break;
	}
	case OMPRTL__kmpc_push_num_threads: {
	// Build void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
	// kmp_int32 num_threads)
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
	CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_num_threads");
	break;
	}
	case OMPRTL__kmpc_serialized_parallel: {
	// Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
	// global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
	break;
	}
	case OMPRTL__kmpc_end_serialized_parallel: {
	// Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
	// global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
	break;
	}
	case OMPRTL__kmpc_flush: {
	// Build void __kmpc_flush(ident_t *loc);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy()};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_flush");
	break;
	}
	case OMPRTL__kmpc_master: {
	// Build kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_master");
	break;
	}
	case OMPRTL__kmpc_end_master: {
	// Build void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_end_master");
	break;
	}
	case OMPRTL__kmpc_omp_taskyield: {
	// Build kmp_int32 __kmpc_omp_taskyield(ident_t *, kmp_int32 global_tid,
	// int end_part);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_omp_taskyield");
	break;
	}
	case OMPRTL__kmpc_single: {
	// Build kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_single");
	break;
	}
	case OMPRTL__kmpc_end_single: {
	// Build void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_end_single");
	break;
	}
	case OMPRTL__kmpc_omp_task_alloc: {
	// Build kmp_task_t __kmpc_omp_task_alloc(ident_t , kmp_int32 gtid,
	// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
	// kmp_routine_entry_t *task_entry);
	assert(KmpRoutineEntryPtrTy != nullptr &&
	"Type kmp_routine_entry_t must be created.");
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty,
	CGM.SizeTy, CGM.SizeTy, KmpRoutineEntryPtrTy};
	// Return void * and then cast to particular kmp_task_t type.
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_omp_task_alloc");
	break;
	}
	case OMPRTL__kmpc_omp_task: {
	// Build kmp_int32 __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t
	// *new_task);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
	CGM.VoidPtrTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_omp_task");
	break;
	}
	case OMPRTL__kmpc_copyprivate: {
	// Build void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
	// size_t cpy_size, void cpy_data, void(cpy_func)(void , void ),
	// kmp_int32 didit);
	llvm::Type *CpyTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
	auto *CpyFnTy =
	llvm::FunctionType::get(CGM.VoidTy, CpyTypeParams, /isVarArg=/false);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.SizeTy,
	CGM.VoidPtrTy, CpyFnTy->getPointerTo(),
	CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_copyprivate");
	break;
	}
	case OMPRTL__kmpc_reduce: {
	// Build kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid,
	// kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void
	// (reduce_func)(void lhs_data, void rhs_data), kmp_critical_name lck);
	llvm::Type *ReduceTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
	auto *ReduceFnTy = llvm::FunctionType::get(CGM.VoidTy, ReduceTypeParams,
	/isVarArg=/false);
	llvm::Type *TypeParams[] = {
	getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy,
	CGM.VoidPtrTy, ReduceFnTy->getPointerTo(),
	llvm::PointerType::getUnqual(KmpCriticalNameTy)};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_reduce");
	break;
	}
	case OMPRTL__kmpc_reduce_nowait: {
	// Build kmp_int32 __kmpc_reduce_nowait(ident_t *loc, kmp_int32
	// global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
	// void (reduce_func)(void lhs_data, void *rhs_data), kmp_critical_name
	// *lck);
	llvm::Type *ReduceTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
	auto *ReduceFnTy = llvm::FunctionType::get(CGM.VoidTy, ReduceTypeParams,
	/isVarArg=/false);
	llvm::Type *TypeParams[] = {
	getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy,
	CGM.VoidPtrTy, ReduceFnTy->getPointerTo(),
	llvm::PointerType::getUnqual(KmpCriticalNameTy)};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_reduce_nowait");
	break;
	}
	case OMPRTL__kmpc_end_reduce: {
	// Build void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
	// kmp_critical_name *lck);
	llvm::Type *TypeParams[] = {
	getIdentTyPointerTy(), CGM.Int32Ty,
	llvm::PointerType::getUnqual(KmpCriticalNameTy)};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_end_reduce");
	break;
	}
	case OMPRTL__kmpc_end_reduce_nowait: {
	// Build __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
	// kmp_critical_name *lck);
	llvm::Type *TypeParams[] = {
	getIdentTyPointerTy(), CGM.Int32Ty,
	llvm::PointerType::getUnqual(KmpCriticalNameTy)};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn =
	CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_end_reduce_nowait");
	break;
	}
	case OMPRTL__kmpc_omp_task_begin_if0: {
	// Build void __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t
	// *new_task);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
	CGM.VoidPtrTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn =
	CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_omp_task_begin_if0");
	break;
	}
	case OMPRTL__kmpc_omp_task_complete_if0: {
	// Build void __kmpc_omp_task(ident_t *, kmp_int32 gtid, kmp_task_t
	// *new_task);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
	CGM.VoidPtrTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy,
	/Name=/"__kmpc_omp_task_complete_if0");
	break;
	}
	case OMPRTL__kmpc_ordered: {
	// Build void __kmpc_ordered(ident_t *loc, kmp_int32 global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_ordered");
	break;
	}
	case OMPRTL__kmpc_end_ordered: {
	// Build void __kmpc_end_ordered(ident_t *loc, kmp_int32 global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_ordered");
	break;
	}
	case OMPRTL__kmpc_omp_taskwait: {
	// Build kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32 global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_omp_taskwait");
	break;
	}
	case OMPRTL__kmpc_taskgroup: {
	// Build void __kmpc_taskgroup(ident_t *loc, kmp_int32 global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_taskgroup");
	break;
	}
	case OMPRTL__kmpc_end_taskgroup: {
	// Build void __kmpc_end_taskgroup(ident_t *loc, kmp_int32 global_tid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_taskgroup");
	break;
	}
	case OMPRTL__kmpc_push_proc_bind: {
	// Build void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
	// int proc_bind)
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_proc_bind");
	break;
	}
	case OMPRTL__kmpc_omp_task_with_deps: {
	// Build kmp_int32 __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid,
	// kmp_task_t new_task, kmp_int32 ndeps, kmp_depend_info_t dep_list,
	// kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list);
	llvm::Type *TypeParams[] = {
	getIdentTyPointerTy(), CGM.Int32Ty, CGM.VoidPtrTy, CGM.Int32Ty,
	CGM.VoidPtrTy, CGM.Int32Ty, CGM.VoidPtrTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg=/false);
	RTLFn =
	CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_omp_task_with_deps");
	break;
	}
	case OMPRTL__kmpc_omp_wait_deps: {
	// Build void __kmpc_omp_wait_deps(ident_t *, kmp_int32 gtid,
	// kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
	// kmp_depend_info_t *noalias_dep_list);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
	CGM.Int32Ty, CGM.VoidPtrTy,
	CGM.Int32Ty, CGM.VoidPtrTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_omp_wait_deps");
	break;
	}
	case OMPRTL__kmpc_cancellationpoint: {
	// Build kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32
	// global_tid, kmp_int32 cncl_kind)
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancellationpoint");
	break;
	}
	case OMPRTL__kmpc_cancel: {
	// Build kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
	// kmp_int32 cncl_kind)
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.IntTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancel");
	break;
	}
	case OMPRTL__kmpc_push_num_teams: {
	// Build void kmpc_push_num_teams (ident_t loc, kmp_int32 global_tid,
	// kmp_int32 num_teams, kmp_int32 num_threads)
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty, CGM.Int32Ty,
	CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_num_teams");
	break;
	}
	case OMPRTL__kmpc_fork_teams: {
	// Build void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro
	// microtask, ...);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
	getKmpc_MicroPointerTy()};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ true);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_fork_teams");
	break;
	}
	case OMPRTL__kmpc_taskloop: {
	// Build void __kmpc_taskloop(ident_t loc, int gtid, kmp_task_t task, int
	// if_val, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, int nogroup, int
	// sched, kmp_uint64 grainsize, void *task_dup);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
	CGM.IntTy,
	CGM.VoidPtrTy,
	CGM.IntTy,
	CGM.Int64Ty->getPointerTo(),
	CGM.Int64Ty->getPointerTo(),
	CGM.Int64Ty,
	CGM.IntTy,
	CGM.IntTy,
	CGM.Int64Ty,
	CGM.VoidPtrTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_taskloop");
	break;
	}
	case OMPRTL__kmpc_doacross_init: {
	// Build void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32
	// num_dims, struct kmp_dim *dims);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
	CGM.Int32Ty,
	CGM.Int32Ty,
	CGM.VoidPtrTy};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_doacross_init");
	break;
	}
	case OMPRTL__kmpc_doacross_fini: {
	// Build void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_doacross_fini");
	break;
	}
	case OMPRTL__kmpc_doacross_post: {
	// Build void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64
	// *vec);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
	CGM.Int64Ty->getPointerTo()};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_doacross_post");
	break;
	}
	case OMPRTL__kmpc_doacross_wait: {
	// Build void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64
	// *vec);
	llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
	CGM.Int64Ty->getPointerTo()};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg=/false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, /Name=/"__kmpc_doacross_wait");
	break;
	}
	case OMPRTL__tgt_target: {
	// Build int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t
	// arg_num, void args_base, void args, size_t *arg_sizes, int32_t
	// *arg_types);
	llvm::Type *TypeParams[] = {CGM.Int32Ty,
	CGM.VoidPtrTy,
	CGM.Int32Ty,
	CGM.VoidPtrPtrTy,
	CGM.VoidPtrPtrTy,
	CGM.SizeTy->getPointerTo(),
	CGM.Int32Ty->getPointerTo()};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target");
	break;
	}
	case OMPRTL__tgt_target_teams: {
	// Build int32_t __tgt_target_teams(int32_t device_id, void *host_ptr,
	// int32_t arg_num, void args_base, void args, size_t *arg_sizes,
	// int32_t *arg_types, int32_t num_teams, int32_t thread_limit);
	llvm::Type *TypeParams[] = {CGM.Int32Ty,
	CGM.VoidPtrTy,
	CGM.Int32Ty,
	CGM.VoidPtrPtrTy,
	CGM.VoidPtrPtrTy,
	CGM.SizeTy->getPointerTo(),
	CGM.Int32Ty->getPointerTo(),
	CGM.Int32Ty,
	CGM.Int32Ty};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_teams");
	break;
	}
	case OMPRTL__tgt_register_lib: {
	// Build void __tgt_register_lib(__tgt_bin_desc *desc);
	QualType ParamTy =
	CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy());
	llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_register_lib");
	break;
	}
	case OMPRTL__tgt_unregister_lib: {
	// Build void __tgt_unregister_lib(__tgt_bin_desc *desc);
	QualType ParamTy =
	CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy());
	llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_unregister_lib");
	break;
	}
	case OMPRTL__tgt_target_data_begin: {
	// Build void __tgt_target_data_begin(int32_t device_id, int32_t arg_num,
	// void args_base, void args, size_t arg_sizes, int32_t arg_types);
	llvm::Type *TypeParams[] = {CGM.Int32Ty,
	CGM.Int32Ty,
	CGM.VoidPtrPtrTy,
	CGM.VoidPtrPtrTy,
	CGM.SizeTy->getPointerTo(),
	CGM.Int32Ty->getPointerTo()};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_begin");
	break;
	}
	case OMPRTL__tgt_target_data_end: {
	// Build void __tgt_target_data_end(int32_t device_id, int32_t arg_num,
	// void args_base, void args, size_t arg_sizes, int32_t arg_types);
	llvm::Type *TypeParams[] = {CGM.Int32Ty,
	CGM.Int32Ty,
	CGM.VoidPtrPtrTy,
	CGM.VoidPtrPtrTy,
	CGM.SizeTy->getPointerTo(),
	CGM.Int32Ty->getPointerTo()};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_end");
	break;
	}
	case OMPRTL__tgt_target_data_update: {
	// Build void __tgt_target_data_update(int32_t device_id, int32_t arg_num,
	// void args_base, void args, size_t arg_sizes, int32_t arg_types);
	llvm::Type *TypeParams[] = {CGM.Int32Ty,
	CGM.Int32Ty,
	CGM.VoidPtrPtrTy,
	CGM.VoidPtrPtrTy,
	CGM.SizeTy->getPointerTo(),
	CGM.Int32Ty->getPointerTo()};
	llvm::FunctionType *FnTy =
	llvm::FunctionType::get(CGM.VoidTy, TypeParams, /isVarArg/ false);
	RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target_data_update");
	break;
	}
	}
	assert(RTLFn && "Unable to find OpenMP runtime function");
	return RTLFn;
	}

	/// Declares the __kmpc_for_static_init_* runtime entry point that matches the
	/// width and signedness of the loop induction variable.
	///
	/// \param IVSize Width of the induction variable in bits; asserted to be 32
	/// or 64.
	/// \param IVSigned Selects the "_4"/"_8" entry when true and the "_4u"/"_8u"
	/// entry when false.
	/// \returns Whatever CGM.CreateRuntimeFunction yields for the selected name.
	llvm::Constant *CGOpenMPRuntime::createForStaticInitFunction(unsigned IVSize,
	                                                             bool IVSigned) {
	  assert((IVSize == 32 || IVSize == 64) &&
	         "IV size is not compatible with the omp runtime");
	  auto Name = IVSize == 32 ? (IVSigned ? "__kmpc_for_static_init_4"
	                                       : "__kmpc_for_static_init_4u")
	                           : (IVSigned ? "__kmpc_for_static_init_8"
	                                       : "__kmpc_for_static_init_8u");
	  // Integer type of the bounds/stride values and the pointer type used for
	  // the in/out parameters.
	  auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
	  auto PtrTy = llvm::PointerType::getUnqual(ITy);
	  llvm::Type *TypeParams[] = {
	      getIdentTyPointerTy(),                     // loc
	      CGM.Int32Ty,                               // tid
	      CGM.Int32Ty,                               // schedtype
	      llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
	      PtrTy,                                     // p_lower
	      PtrTy,                                     // p_upper
	      PtrTy,                                     // p_stride
	      ITy,                                       // incr
	      ITy                                        // chunk
	  };
	  llvm::FunctionType *FnTy =
	      llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	  return CGM.CreateRuntimeFunction(FnTy, Name);
	}

	/// Declares the __kmpc_dispatch_init_* runtime entry point that matches the
	/// width and signedness of the loop induction variable.
	///
	/// \param IVSize Width of the induction variable in bits; asserted to be 32
	/// or 64.
	/// \param IVSigned Selects the "_4"/"_8" entry when true and the "_4u"/"_8u"
	/// entry when false.
	/// \returns Whatever CGM.CreateRuntimeFunction yields for the selected name.
	llvm::Constant *CGOpenMPRuntime::createDispatchInitFunction(unsigned IVSize,
	                                                            bool IVSigned) {
	  assert((IVSize == 32 || IVSize == 64) &&
	         "IV size is not compatible with the omp runtime");
	  auto Name =
	      IVSize == 32
	          ? (IVSigned ? "__kmpc_dispatch_init_4" : "__kmpc_dispatch_init_4u")
	          : (IVSigned ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_8u");
	  // Bounds, stride and chunk are passed by value here, so only the integer
	  // type is needed.
	  auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
	  llvm::Type *TypeParams[] = { getIdentTyPointerTy(), // loc
	                               CGM.Int32Ty,           // tid
	                               CGM.Int32Ty,           // schedtype
	                               ITy,                   // lower
	                               ITy,                   // upper
	                               ITy,                   // stride
	                               ITy                    // chunk
	  };
	  llvm::FunctionType *FnTy =
	      llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
	  return CGM.CreateRuntimeFunction(FnTy, Name);
	}

	/// Declares the __kmpc_dispatch_fini_* runtime entry point that matches the
	/// width and signedness of the loop induction variable.
	///
	/// \param IVSize Width of the induction variable in bits; asserted to be 32
	/// or 64.
	/// \param IVSigned Selects the "_4"/"_8" entry when true and the "_4u"/"_8u"
	/// entry when false.
	/// \returns Whatever CGM.CreateRuntimeFunction yields for the selected name.
	llvm::Constant *CGOpenMPRuntime::createDispatchFiniFunction(unsigned IVSize,
	                                                            bool IVSigned) {
	  assert((IVSize == 32 || IVSize == 64) &&
	         "IV size is not compatible with the omp runtime");
	  auto Name =
	      IVSize == 32
	          ? (IVSigned ? "__kmpc_dispatch_fini_4" : "__kmpc_dispatch_fini_4u")
	          : (IVSigned ? "__kmpc_dispatch_fini_8" : "__kmpc_dispatch_fini_8u");
	  // The signature is the same for every variant; only the name differs.
	  llvm::Type *TypeParams[] = {
	      getIdentTyPointerTy(), // loc
	      CGM.Int32Ty,           // tid
	  };
	  llvm::FunctionType *FnTy =
	      llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
	  return CGM.CreateRuntimeFunction(FnTy, Name);
	}

	/// Declares the __kmpc_dispatch_next_* runtime entry point that matches the
	/// width and signedness of the loop induction variable.
	///
	/// \param IVSize Width of the induction variable in bits; asserted to be 32
	/// or 64.
	/// \param IVSigned Selects the "_4"/"_8" entry when true and the "_4u"/"_8u"
	/// entry when false.
	/// \returns Whatever CGM.CreateRuntimeFunction yields for the selected name;
	/// the declared function type returns a 32-bit integer.
	llvm::Constant *CGOpenMPRuntime::createDispatchNextFunction(unsigned IVSize,
	                                                            bool IVSigned) {
	  assert((IVSize == 32 || IVSize == 64) &&
	         "IV size is not compatible with the omp runtime");
	  auto Name =
	      IVSize == 32
	          ? (IVSigned ? "__kmpc_dispatch_next_4" : "__kmpc_dispatch_next_4u")
	          : (IVSigned ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_8u");
	  auto ITy = IVSize == 32 ? CGM.Int32Ty : CGM.Int64Ty;
	  auto PtrTy = llvm::PointerType::getUnqual(ITy);
	  llvm::Type *TypeParams[] = {
	      getIdentTyPointerTy(),                     // loc
	      CGM.Int32Ty,                               // tid
	      llvm::PointerType::getUnqual(CGM.Int32Ty), // p_lastiter
	      PtrTy,                                     // p_lower
	      PtrTy,                                     // p_upper
	      PtrTy                                      // p_stride
	  };
	  llvm::FunctionType *FnTy =
	      llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
	  return CGM.CreateRuntimeFunction(FnTy, Name);
	}

	/// Returns the internal variable named "<mangled name of VD>.cache.", of type
	/// CGM.Int8PtrPtrTy, creating it on first request.
	///
	/// Asserts that the TLS path is not in effect (OpenMPUseTLS is off or the
	/// target reports no TLS support).
	llvm::Constant *
	CGOpenMPRuntime::getOrCreateThreadPrivateCache(const VarDecl *VD) {
	  assert(!CGM.getLangOpts().OpenMPUseTLS ||
	         !CGM.getContext().getTargetInfo().isTLSSupported());
	  // Lookup the entry, lazily creating it if necessary.
	  return getOrCreateInternalVariable(CGM.Int8PtrPtrTy,
	                                     Twine(CGM.getMangledName(VD)) + ".cache.");
	}

	/// Produces the address to use for the threadprivate variable \p VD.
	///
	/// When OpenMPUseTLS is enabled and the target supports TLS, \p VDAddr is
	/// returned untouched. Otherwise a call to the
	/// OMPRTL__kmpc_threadprivate_cached runtime function is emitted and its
	/// result is wrapped in an Address carrying the alignment of \p VDAddr.
	Address CGOpenMPRuntime::getAddrOfThreadPrivate(CodeGenFunction &CGF,
	                                                const VarDecl *VD,
	                                                Address VDAddr,
	                                                SourceLocation Loc) {
	  const bool UseNativeTLS =
	      CGM.getLangOpts().OpenMPUseTLS &&
	      CGM.getContext().getTargetInfo().isTLSSupported();
	  if (UseNativeTLS)
	    return VDAddr;

	  // Materialize the call operands one by one, in the same order in which they
	  // are passed to the runtime.
	  llvm::Value *LocArg = emitUpdateLocation(CGF, Loc);
	  llvm::Value *ThreadIDArg = getThreadID(CGF, Loc);
	  llvm::Value *VarPtrArg =
	      CGF.Builder.CreatePointerCast(VDAddr.getPointer(), CGM.Int8PtrTy);
	  llvm::Value *VarSizeArg =
	      CGM.getSize(CGM.GetTargetTypeStoreSize(VDAddr.getElementType()));
	  llvm::Value *CacheArg = getOrCreateThreadPrivateCache(VD);
	  llvm::Value *CallArgs[] = {LocArg, ThreadIDArg, VarPtrArg, VarSizeArg,
	                             CacheArg};

	  llvm::Value *CachedPtr = CGF.EmitRuntimeCall(
	      createRuntimeFunction(OMPRTL__kmpc_threadprivate_cached), CallArgs);
	  return Address(CachedPtr, VDAddr.getAlignment());
	}

	void CGOpenMPRuntime::emitThreadPrivateVarInit(
	CodeGenFunction &CGF, Address VDAddr, llvm::Value *Ctor,
	llvm::Value CopyCtor, llvm::Value Dtor, SourceLocation Loc) {
	// Call kmp_int32 __kmpc_global_thread_num(&loc) to init OpenMP runtime
	// library.
	auto OMPLoc = emitUpdateLocation(CGF, Loc);
	CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_global_thread_num),
	OMPLoc);
	// Call __kmpc_threadprivate_register(&loc, &var, ctor, cctor/NULL/, dtor)
	// to register constructor/destructor for variable.
	llvm::Value *Args[] = {OMPLoc,
	CGF.Builder.CreatePointerCast(VDAddr.getPointer(),
	CGM.VoidPtrTy),
	Ctor, CopyCtor, Dtor};
	CGF.EmitRuntimeCall(
	createRuntimeFunction(OMPRTL__kmpc_threadprivate_register), Args);
	}

	/// Emits the constructor/destructor helpers for the threadprivate variable
	/// \p VD and registers them with the OpenMP runtime.
	///
	/// Nothing is emitted (and nullptr is returned) when the TLS path is in
	/// effect, when \p VD has no definition, when \p VD was already processed, or
	/// when neither a constructor nor a destructor helper is needed.
	///
	/// \param VD Variable to process; replaced by its definition.
	/// \param VDAddr Address of the original variable.
	/// \param Loc Location used for the generated functions and runtime calls.
	/// \param PerformInit Whether the initializer should be re-emitted; only
	/// honoured when compiling C++.
	/// \param CGF If non-null, the registration calls are emitted into this
	/// function; if null, a dedicated ".__omp_threadprivate_init_." function is
	/// created to hold them.
	/// \returns The dedicated init function when \p CGF is null and registration
	/// was emitted, nullptr in every other case.
	llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
	    const VarDecl *VD, Address VDAddr, SourceLocation Loc,
	    bool PerformInit, CodeGenFunction *CGF) {
	  if (CGM.getLangOpts().OpenMPUseTLS &&
	      CGM.getContext().getTargetInfo().isTLSSupported())
	    return nullptr;

	  VD = VD->getDefinition(CGM.getContext());
	  // ThreadPrivateWithDefinition guards against processing a variable twice.
	  if (VD && ThreadPrivateWithDefinition.count(VD) == 0) {
	    ThreadPrivateWithDefinition.insert(VD);
	    QualType ASTTy = VD->getType();

	    llvm::Value *Ctor = nullptr, *CopyCtor = nullptr, *Dtor = nullptr;
	    auto Init = VD->getAnyInitializer();
	    if (CGM.getLangOpts().CPlusPlus && PerformInit) {
	      // Generate function that re-emits the declaration's initializer into the
	      // threadprivate copy of the variable VD
	      CodeGenFunction CtorCGF(CGM);
	      FunctionArgList Args;
	      ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, SourceLocation(),
	                            /*Id=*/nullptr, CGM.getContext().VoidPtrTy);
	      Args.push_back(&Dst);

	      auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(
	          CGM.getContext().VoidPtrTy, Args);
	      auto FTy = CGM.getTypes().GetFunctionType(FI);
	      auto Fn = CGM.CreateGlobalInitOrDestructFunction(
	          FTy, ".__kmpc_global_ctor_.", FI, Loc);
	      CtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidPtrTy, Fn, FI,
	                            Args, SourceLocation());
	      // Load the incoming void* and view it as a pointer to the variable's
	      // type, keeping the alignment of the original variable.
	      auto ArgVal = CtorCGF.EmitLoadOfScalar(
	          CtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false,
	          CGM.getContext().VoidPtrTy, Dst.getLocation());
	      Address Arg = Address(ArgVal, VDAddr.getAlignment());
	      Arg = CtorCGF.Builder.CreateElementBitCast(Arg,
	                                             CtorCGF.ConvertTypeForMem(ASTTy));
	      CtorCGF.EmitAnyExprToMem(Init, Arg, Init->getType().getQualifiers(),
	                               /*IsInitializer=*/true);
	      // The helper returns the pointer it was given.
	      ArgVal = CtorCGF.EmitLoadOfScalar(
	          CtorCGF.GetAddrOfLocalVar(&Dst), /*Volatile=*/false,
	          CGM.getContext().VoidPtrTy, Dst.getLocation());
	      CtorCGF.Builder.CreateStore(ArgVal, CtorCGF.ReturnValue);
	      CtorCGF.FinishFunction();
	      Ctor = Fn;
	    }
	    if (VD->getType().isDestructedType() != QualType::DK_none) {
	      // Generate function that emits destructor call for the threadprivate copy
	      // of the variable VD
	      CodeGenFunction DtorCGF(CGM);
	      FunctionArgList Args;
	      ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, SourceLocation(),
	                            /*Id=*/nullptr, CGM.getContext().VoidPtrTy);
	      Args.push_back(&Dst);

	      auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(
	          CGM.getContext().VoidTy, Args);
	      auto FTy = CGM.getTypes().GetFunctionType(FI);
	      auto Fn = CGM.CreateGlobalInitOrDestructFunction(
	          FTy, ".__kmpc_global_dtor_.", FI, Loc);
	      auto NL = ApplyDebugLocation::CreateEmpty(DtorCGF);
	      DtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, Fn, FI, Args,
	                            SourceLocation());
	      // Create a scope with an artificial location for the body of this function.
	      auto AL = ApplyDebugLocation::CreateArtificial(DtorCGF);
	      auto ArgVal = DtorCGF.EmitLoadOfScalar(
	          DtorCGF.GetAddrOfLocalVar(&Dst),
	          /*Volatile=*/false, CGM.getContext().VoidPtrTy, Dst.getLocation());
	      DtorCGF.emitDestroy(Address(ArgVal, VDAddr.getAlignment()), ASTTy,
	                          DtorCGF.getDestroyer(ASTTy.isDestructedType()),
	                          DtorCGF.needsEHCleanup(ASTTy.isDestructedType()));
	      DtorCGF.FinishFunction();
	      Dtor = Fn;
	    }
	    // Do not emit init function if it is not required.
	    if (!Ctor && !Dtor)
	      return nullptr;

	    llvm::Type *CopyCtorTyArgs[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
	    auto CopyCtorTy =
	        llvm::FunctionType::get(CGM.VoidPtrTy, CopyCtorTyArgs,
	                                /*isVarArg=*/false)->getPointerTo();
	    // Copying constructor for the threadprivate variable.
	    // Must be NULL - reserved by runtime, but currently it requires that this
	    // parameter is always NULL. Otherwise it fires assertion.
	    CopyCtor = llvm::Constant::getNullValue(CopyCtorTy);
	    // Whichever helper was not generated is passed as a typed null pointer.
	    if (Ctor == nullptr) {
	      auto CtorTy = llvm::FunctionType::get(CGM.VoidPtrTy, CGM.VoidPtrTy,
	                                            /*isVarArg=*/false)->getPointerTo();
	      Ctor = llvm::Constant::getNullValue(CtorTy);
	    }
	    if (Dtor == nullptr) {
	      auto DtorTy = llvm::FunctionType::get(CGM.VoidTy, CGM.VoidPtrTy,
	                                            /*isVarArg=*/false)->getPointerTo();
	      Dtor = llvm::Constant::getNullValue(DtorTy);
	    }
	    if (!CGF) {
	      // No function to emit into: wrap the registration in its own function
	      // and hand that back to the caller.
	      auto InitFunctionTy =
	          llvm::FunctionType::get(CGM.VoidTy, /*isVarArg*/ false);
	      auto InitFunction = CGM.CreateGlobalInitOrDestructFunction(
	          InitFunctionTy, ".__omp_threadprivate_init_.",
	          CGM.getTypes().arrangeNullaryFunction());
	      CodeGenFunction InitCGF(CGM);
	      FunctionArgList ArgList;
	      InitCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, InitFunction,
	                            CGM.getTypes().arrangeNullaryFunction(), ArgList,
	                            Loc);
	      emitThreadPrivateVarInit(InitCGF, VDAddr, Ctor, CopyCtor, Dtor, Loc);
	      InitCGF.FinishFunction();
	      return InitFunction;
	    }
	    emitThreadPrivateVarInit(*CGF, VDAddr, Ctor, CopyCtor, Dtor, Loc);
	  }
	  return nullptr;
	}

	/// \brief Emits code for an OpenMP 'if' clause using the two region
	/// generators supplied by the caller. The emitted logic is:
	/// \code
	/// if (Cond) {
	///   ThenGen();
	/// } else {
	///   ElseGen();
	/// }
	/// \endcode
	///
	/// \param CGF Function into which code is emitted.
	/// \param Cond Condition of the 'if' clause.
	/// \param ThenGen Generator invoked for the true arm.
	/// \param ElseGen Generator invoked for the false arm.
	///
	/// When \p Cond folds to a constant only the live arm is emitted and no
	/// branch is generated.
	void CGOpenMPRuntime::emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
	                                      const RegionCodeGenTy &ThenGen,
	                                      const RegionCodeGenTy &ElseGen) {
	  CodeGenFunction::LexicalScope ConditionScope(CGF, Cond->getSourceRange());

	  // If the condition constant folds and can be elided, try to avoid emitting
	  // the condition and the dead arm of the if/else.
	  bool CondConstant;
	  if (CGF.ConstantFoldsToSimpleInteger(Cond, CondConstant)) {
	    if (CondConstant)
	      ThenGen(CGF);
	    else
	      ElseGen(CGF);
	    return;
	  }

	  // Otherwise, the condition did not fold, or we couldn't elide it.  Just
	  // emit the conditional branch.
	  auto ThenBlock = CGF.createBasicBlock("omp_if.then");
	  auto ElseBlock = CGF.createBasicBlock("omp_if.else");
	  auto ContBlock = CGF.createBasicBlock("omp_if.end");
	  CGF.EmitBranchOnBoolExpr(Cond, ThenBlock, ElseBlock, /*TrueCount=*/0);

	  // Emit the 'then' code.
	  CGF.EmitBlock(ThenBlock);
	  ThenGen(CGF);
	  CGF.EmitBranch(ContBlock);
	  // Emit the 'else' code if present.
	  // There is no need to emit line number for unconditional branch.
	  // NOTE(review): the object returned by CreateEmpty is a temporary that is
	  // destroyed at the end of this statement - confirm the empty location is
	  // meant to apply to the code that follows.
	  (void)ApplyDebugLocation::CreateEmpty(CGF);
	  CGF.EmitBlock(ElseBlock);
	  ElseGen(CGF);
	  // There is no need to emit line number for unconditional branch.
	  (void)ApplyDebugLocation::CreateEmpty(CGF);
	  CGF.EmitBranch(ContBlock);
	  // Emit the continuation block for code after the if.
	  CGF.EmitBlock(ContBlock, /*IsFinished=*/true);
	}

	/// Emits the invocation of an outlined parallel region.
	///
	/// Two code paths are prepared: one that forks through
	/// OMPRTL__kmpc_fork_call, and one that runs the outlined function directly
	/// between OMPRTL__kmpc_serialized_parallel and
	/// OMPRTL__kmpc_end_serialized_parallel. With an 'if' condition both paths
	/// are handed to emitOMPIfClause; without one only the fork path is emitted.
	///
	/// \param CGF Function into which code is emitted; nothing happens when it
	/// has no insert point.
	/// \param Loc Location used for the ident_t arguments.
	/// \param OutlinedFn Outlined region body.
	/// \param CapturedVars Values appended after the fixed arguments.
	/// \param IfCond Optional 'if' clause condition; may be null.
	void CGOpenMPRuntime::emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
	                                       llvm::Value *OutlinedFn,
	                                       ArrayRef<llvm::Value *> CapturedVars,
	                                       const Expr *IfCond) {
	  if (!CGF.HaveInsertPoint())
	    return;
	  auto *RTLoc = emitUpdateLocation(CGF, Loc);
	  auto &&ThenGen = [OutlinedFn, CapturedVars, RTLoc](CodeGenFunction &CGF,
	                                                     PrePostActionTy &) {
	    // Build call __kmpc_fork_call(loc, n, microtask, var1, .., varn);
	    auto &RT = CGF.CGM.getOpenMPRuntime();
	    llvm::Value *Args[] = {
	        RTLoc,
	        CGF.Builder.getInt32(CapturedVars.size()), // Number of captured vars
	        CGF.Builder.CreateBitCast(OutlinedFn, RT.getKmpc_MicroPointerTy())};
	    llvm::SmallVector<llvm::Value *, 16> RealArgs;
	    RealArgs.append(std::begin(Args), std::end(Args));
	    RealArgs.append(CapturedVars.begin(), CapturedVars.end());

	    auto RTLFn = RT.createRuntimeFunction(OMPRTL__kmpc_fork_call);
	    CGF.EmitRuntimeCall(RTLFn, RealArgs);
	  };
	  auto &&ElseGen = [OutlinedFn, CapturedVars, RTLoc, Loc](CodeGenFunction &CGF,
	                                                          PrePostActionTy &) {
	    auto &RT = CGF.CGM.getOpenMPRuntime();
	    auto ThreadID = RT.getThreadID(CGF, Loc);
	    // Build calls:
	    // __kmpc_serialized_parallel(&Loc, GTid);
	    llvm::Value *Args[] = {RTLoc, ThreadID};
	    CGF.EmitRuntimeCall(
	        RT.createRuntimeFunction(OMPRTL__kmpc_serialized_parallel), Args);

	    // OutlinedFn(&GTid, &zero, CapturedStruct);
	    auto ThreadIDAddr = RT.emitThreadIDAddress(CGF, Loc);
	    // The second argument is the address of a temporary i32 initialized to 0.
	    Address ZeroAddr =
	        CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
	                             /*Name*/ ".zero.addr");
	    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
	    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
	    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
	    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
	    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
	    CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);

	    // __kmpc_end_serialized_parallel(&Loc, GTid);
	    llvm::Value *EndArgs[] = {RT.emitUpdateLocation(CGF, Loc), ThreadID};
	    CGF.EmitRuntimeCall(
	        RT.createRuntimeFunction(OMPRTL__kmpc_end_serialized_parallel),
	        EndArgs);
	  };
	  if (IfCond)
	    emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
	  else {
	    RegionCodeGenTy ThenRCG(ThenGen);
	    ThenRCG(CGF);
	  }
	}

	// If we're inside an (outlined) parallel region, use the region info's
	// thread-ID variable (it is passed in a first argument of the outlined function
	// as "kmp_int32 *gtid"). Otherwise, if we're not inside parallel region, but in
	// regular serial code region, get thread ID by calling kmp_int32
	// __kmpc_global_thread_num(ident_t *loc), stash this thread ID in a temporary
	// and return the address of that temp.
	//
	// CGF is the function being emitted; Loc is forwarded to getThreadID on the
	// fallback path. The temporary is a signed 32-bit integer named
	// ".threadid_temp.".
	Address CGOpenMPRuntime::emitThreadIDAddress(CodeGenFunction &CGF,
	                                             SourceLocation Loc) {
	  if (auto *OMPRegionInfo =
	          dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
	    if (OMPRegionInfo->getThreadIDVariable())
	      return OMPRegionInfo->getThreadIDVariableLValue(CGF).getAddress();

	  auto ThreadID = getThreadID(CGF, Loc);
	  auto Int32Ty =
	      CGF.getContext().getIntTypeForBitwidth(/*DestWidth*/ 32, /*Signed*/ true);
	  auto ThreadIDTemp = CGF.CreateMemTemp(Int32Ty, /*Name*/ ".threadid_temp.");
	  CGF.EmitStoreOfScalar(ThreadID,
	                        CGF.MakeAddrLValue(ThreadIDTemp, Int32Ty));

	  return ThreadIDTemp;
	}

	/// Looks up the internal variable called \p Name in InternalVars and returns
	/// it, creating a new global on the first request.
	///
	/// \param Ty Value type of the variable. For an already existing entry this
	/// is asserted to equal the pointee type of the stored global.
	/// \param Name Name of the variable; rendered into a local buffer to form the
	/// map key.
	/// \returns The existing or newly created global. New globals are
	/// non-constant, use common linkage and are initialized with the null value
	/// of \p Ty.
	llvm::Constant *
	CGOpenMPRuntime::getOrCreateInternalVariable(llvm::Type *Ty,
	                                             const llvm::Twine &Name) {
	  SmallString<256> Buffer;
	  llvm::raw_svector_ostream Out(Buffer);
	  Out << Name;
	  auto RuntimeName = Out.str();
	  // insert() leaves an existing entry untouched, so Elem.second is non-null
	  // exactly when the variable was created by an earlier call.
	  auto &Elem = *InternalVars.insert(std::make_pair(RuntimeName, nullptr)).first;
	  if (Elem.second) {
	    assert(Elem.second->getType()->getPointerElementType() == Ty &&
	           "OMP internal variable has different type than requested");
	    return &*Elem.second;
	  }

	  return Elem.second = new llvm::GlobalVariable(
	             CGM.getModule(), Ty, /*IsConstant*/ false,
	             llvm::GlobalValue::CommonLinkage, llvm::Constant::getNullValue(Ty),
	             Elem.first());
	}

	/// Returns the internal variable of type KmpCriticalNameTy whose name is
	/// ".gomp_critical_user_" + \p CriticalName + ".var".
	llvm::Value *CGOpenMPRuntime::getCriticalRegionLock(StringRef CriticalName) {
	  // Both twine operands outlive the call below, so keeping the prefix twine
	  // in a local is fine here.
	  llvm::Twine LockPrefix(".gomp_critical_user_", CriticalName);
	  return getOrCreateInternalVariable(KmpCriticalNameTy, LockPrefix + ".var");
	}

	namespace {
	/// Common pre(post)-action for different OpenMP constructs.
	class CommonActionTy final : public PrePostActionTy {
	llvm::Value *EnterCallee;
	ArrayRef<llvm::Value *> EnterArgs;
	llvm::Value *ExitCallee;
	ArrayRef<llvm::Value *> ExitArgs;
	bool Conditional;
	llvm::BasicBlock *ContBlock = nullptr;

	public:
	CommonActionTy(llvm::Value EnterCallee, ArrayRef<llvm::Value > EnterArgs,
	llvm::Value ExitCallee, ArrayRef<llvm::Value > ExitArgs,
	bool Conditional = false)
	: EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
	ExitArgs(ExitArgs), Conditional(Conditional) {}
	void Enter(CodeGenFunction &CGF) override {
	llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
	if (Conditional) {
	llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
	auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
	ContBlock = CGF.createBasicBlock("omp_if.end");
	// Generate the branch (If-stmt)
	CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
	CGF.EmitBlock(ThenBlock);
	}
	}
	void Done(CodeGenFunction &CGF) {
	// Emit the rest of blocks/branches
	CGF.EmitBranch(ContBlock);
	CGF.EmitBlock(ContBlock, true);
	}
	void Exit(CodeGenFunction &CGF) override {
	CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
	}
	};
	} // anonymous namespace

	/// Emits a 'critical' region wrapped in the runtime enter/exit calls.
	///
	/// \param CGF Function into which code is emitted; nothing happens when it
	/// has no insert point.
	/// \param CriticalName Name used to obtain the lock variable.
	/// \param CriticalOpGen Generator for the region body.
	/// \param Loc Location used for the ident_t argument.
	/// \param Hint Optional hint expression. When present, the
	/// OMPRTL__kmpc_critical_with_hint entry is used and the hint value, cast to
	/// CGM.IntPtrTy as unsigned, is appended to the enter arguments.
	void CGOpenMPRuntime::emitCriticalRegion(CodeGenFunction &CGF,
	                                         StringRef CriticalName,
	                                         const RegionCodeGenTy &CriticalOpGen,
	                                         SourceLocation Loc, const Expr *Hint) {
	  // __kmpc_critical[_with_hint](ident_t *, gtid, Lock[, hint]);
	  // CriticalOpGen();
	  // __kmpc_end_critical(ident_t *, gtid, Lock);
	  // Prepare arguments and build a call to __kmpc_critical
	  if (!CGF.HaveInsertPoint())
	    return;
	  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
	                         getCriticalRegionLock(CriticalName)};
	  // The enter call takes the same arguments as the exit call, plus the hint.
	  llvm::SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args),
	                                                std::end(Args));
	  if (Hint) {
	    EnterArgs.push_back(CGF.Builder.CreateIntCast(
	        CGF.EmitScalarExpr(Hint), CGM.IntPtrTy, /*isSigned=*/false));
	  }
	  CommonActionTy Action(
	      createRuntimeFunction(Hint ? OMPRTL__kmpc_critical_with_hint
	                                 : OMPRTL__kmpc_critical),
	      EnterArgs, createRuntimeFunction(OMPRTL__kmpc_end_critical), Args);
	  CriticalOpGen.setAction(Action);
	  emitInlinedDirective(CGF, OMPD_critical, CriticalOpGen);
	}

	/// Emits a 'master' region guarded by the result of the
	/// OMPRTL__kmpc_master runtime call.
	///
	/// \param CGF Function into which code is emitted; nothing happens when it
	/// has no insert point.
	/// \param MasterOpGen Generator for the region body.
	/// \param Loc Location used for the ident_t argument.
	void CGOpenMPRuntime::emitMasterRegion(CodeGenFunction &CGF,
	                                       const RegionCodeGenTy &MasterOpGen,
	                                       SourceLocation Loc) {
	  if (!CGF.HaveInsertPoint())
	    return;
	  // if(__kmpc_master(ident_t *, gtid)) {
	  //   MasterOpGen();
	  //   __kmpc_end_master(ident_t *, gtid);
	  // }
	  // Prepare arguments and build a call to __kmpc_master
	  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
	  CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_master), Args,
	                        createRuntimeFunction(OMPRTL__kmpc_end_master), Args,
	                        /*Conditional=*/true);
	  MasterOpGen.setAction(Action);
	  emitInlinedDirective(CGF, OMPD_master, MasterOpGen);
	  // Close the conditional block opened by the action.
	  Action.Done(CGF);
	}

	/// Emits a call to the OMPRTL__kmpc_omp_taskyield runtime function and, when
	/// the current captured statement info is an OpenMP region, asks that region
	/// to emit its untied switch.
	///
	/// \param CGF Function into which code is emitted; nothing happens when it
	/// has no insert point.
	/// \param Loc Location used for the ident_t argument.
	void CGOpenMPRuntime::emitTaskyieldCall(CodeGenFunction &CGF,
	                                        SourceLocation Loc) {
	  if (!CGF.HaveInsertPoint())
	    return;
	  // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
	  llvm::Value *Args[] = {
	      emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
	      llvm::ConstantInt::get(CGM.IntTy, /*V=*/0, /*isSigned=*/true)};
	  CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_taskyield), Args);
	  if (auto *Region = dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
	    Region->emitUntiedSwitch(CGF);
	}

	/// Emits a 'taskgroup' region bracketed by the OMPRTL__kmpc_taskgroup and
	/// OMPRTL__kmpc_end_taskgroup runtime calls:
	///
	///   __kmpc_taskgroup(ident_t *, gtid);
	///   TaskgroupOpGen();
	///   __kmpc_end_taskgroup(ident_t *, gtid);
	///
	/// Does nothing when \p CGF has no insert point.
	void CGOpenMPRuntime::emitTaskgroupRegion(CodeGenFunction &CGF,
	                                          const RegionCodeGenTy &TaskgroupOpGen,
	                                          SourceLocation Loc) {
	  if (!CGF.HaveInsertPoint())
	    return;

	  // Both runtime calls receive the same (location, thread id) pair.
	  llvm::Value *LocArg = emitUpdateLocation(CGF, Loc);
	  llvm::Value *ThreadIDArg = getThreadID(CGF, Loc);
	  llvm::Value *CallArgs[] = {LocArg, ThreadIDArg};

	  CommonActionTy TaskgroupAction(createRuntimeFunction(OMPRTL__kmpc_taskgroup),
	                                 CallArgs,
	                                 createRuntimeFunction(OMPRTL__kmpc_end_taskgroup),
	                                 CallArgs);
	  TaskgroupOpGen.setAction(TaskgroupAction);
	  emitInlinedDirective(CGF, OMPD_taskgroup, TaskgroupOpGen);
	}

	/// Loads element \p Index from \p Array, an array of pointers to variables,
	/// and returns it as the address of \p Var.
	///
	/// The resulting address uses the declared alignment of \p Var and is cast so
	/// that its element type is the in-memory type of \p Var.
	static Address emitAddrOfVarFromArray(CodeGenFunction &CGF, Address Array,
	                                      unsigned Index, const VarDecl *Var) {
	  // Fetch the stored pointer from the selected slot.
	  Address SlotAddr =
	      CGF.Builder.CreateConstArrayGEP(Array, Index, CGF.getPointerSize());
	  llvm::Value *VarPtr = CGF.Builder.CreateLoad(SlotAddr);

	  // Re-type the raw pointer as an address of the variable.
	  Address VarAddr(VarPtr, CGF.getContext().getDeclAlign(Var));
	  return CGF.Builder.CreateElementBitCast(
	      VarAddr, CGF.ConvertTypeForMem(Var->getType()));
	}

	/// Builds the internal function ".omp.copyprivate.copy_func", which takes
	/// two void* arguments pointing at arrays of variable addresses and copies
	/// each variable from the second array into the matching slot of the first.
	///
	/// \param CGM Module the function is created in.
	/// \param ArgsType Type both incoming pointers are cast to.
	/// \param CopyprivateVars Expressions whose declarations supply the type used
	/// for each copy.
	/// \param DestExprs Expressions naming the destination variables.
	/// \param SrcExprs Expressions naming the source variables.
	/// \param AssignmentOps Assignment operations handed to EmitOMPCopy; their
	/// count determines how many elements are copied.
	/// \returns The created function.
	static llvm::Value *emitCopyprivateCopyFunction(
	    CodeGenModule &CGM, llvm::Type *ArgsType,
	    ArrayRef<const Expr *> CopyprivateVars, ArrayRef<const Expr *> DestExprs,
	    ArrayRef<const Expr *> SrcExprs, ArrayRef<const Expr *> AssignmentOps) {
	  auto &C = CGM.getContext();
	  // void copy_func(void *LHSArg, void *RHSArg);
	  FunctionArgList Args;
	  ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
	                           C.VoidPtrTy);
	  ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
	                           C.VoidPtrTy);
	  Args.push_back(&LHSArg);
	  Args.push_back(&RHSArg);
	  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	  auto *Fn = llvm::Function::Create(
	      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
	      ".omp.copyprivate.copy_func", &CGM.getModule());
	  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
	  CodeGenFunction CGF(CGM);
	  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
	  // Dest = (void*[n])(LHSArg);
	  // Src = (void*[n])(RHSArg);
	  Address LHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	      CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)),
	      ArgsType), CGF.getPointerAlign());
	  Address RHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	      CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)),
	      ArgsType), CGF.getPointerAlign());
	  // *(Type0*)Dst[0] = *(Type0*)Src[0];
	  // *(Type1*)Dst[1] = *(Type1*)Src[1];
	  // ...
	  // *(Typen*)Dst[n] = *(Typen*)Src[n];
	  for (unsigned I = 0, E = AssignmentOps.size(); I < E; ++I) {
	    auto DestVar = cast<VarDecl>(cast<DeclRefExpr>(DestExprs[I])->getDecl());
	    Address DestAddr = emitAddrOfVarFromArray(CGF, LHS, I, DestVar);

	    auto SrcVar = cast<VarDecl>(cast<DeclRefExpr>(SrcExprs[I])->getDecl());
	    Address SrcAddr = emitAddrOfVarFromArray(CGF, RHS, I, SrcVar);

	    // The copy is performed using the type of the copyprivate variable.
	    auto *VD = cast<DeclRefExpr>(CopyprivateVars[I])->getDecl();
	    QualType Type = VD->getType();
	    CGF.EmitOMPCopy(Type, DestAddr, SrcAddr, DestVar, SrcVar, AssignmentOps[I]);
	  }
	  CGF.FinishFunction();
	  return Fn;
	}

	/// Emits a 'single' region and, when copyprivate variables are present, the
	/// trailing OMPRTL__kmpc_copyprivate call.
	///
	/// \param CGF Function into which code is emitted; nothing happens when it
	/// has no insert point.
	/// \param SingleOpGen Generator for the region body.
	/// \param Loc Location used for the ident_t arguments.
	/// \param CopyprivateVars Variables listed as copyprivate; when empty no
	/// copyprivate code is generated.
	/// \param SrcExprs, DstExprs, AssignmentOps Per-variable helper expressions;
	/// asserted to have the same length as \p CopyprivateVars.
	void CGOpenMPRuntime::emitSingleRegion(CodeGenFunction &CGF,
	                                       const RegionCodeGenTy &SingleOpGen,
	                                       SourceLocation Loc,
	                                       ArrayRef<const Expr *> CopyprivateVars,
	                                       ArrayRef<const Expr *> SrcExprs,
	                                       ArrayRef<const Expr *> DstExprs,
	                                       ArrayRef<const Expr *> AssignmentOps) {
	  if (!CGF.HaveInsertPoint())
	    return;
	  assert(CopyprivateVars.size() == SrcExprs.size() &&
	         CopyprivateVars.size() == DstExprs.size() &&
	         CopyprivateVars.size() == AssignmentOps.size());
	  auto &C = CGM.getContext();
	  // int32 did_it = 0;
	  // if(__kmpc_single(ident_t *, gtid)) {
	  //   SingleOpGen();
	  //   __kmpc_end_single(ident_t *, gtid);
	  //   did_it = 1;
	  // }
	  // call __kmpc_copyprivate(ident_t *, gtid, <buf_size>, <copyprivate list>,
	  // <copy_func>, did_it);

	  // DidIt stays invalid when there is nothing to copy; its validity is the
	  // switch for all copyprivate-related code below.
	  Address DidIt = Address::invalid();
	  if (!CopyprivateVars.empty()) {
	    // int32 did_it = 0;
	    auto KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1);
	    DidIt = CGF.CreateMemTemp(KmpInt32Ty, ".omp.copyprivate.did_it");
	    CGF.Builder.CreateStore(CGF.Builder.getInt32(0), DidIt);
	  }
	  // Prepare arguments and build a call to __kmpc_single
	  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc)};
	  CommonActionTy Action(createRuntimeFunction(OMPRTL__kmpc_single), Args,
	                        createRuntimeFunction(OMPRTL__kmpc_end_single), Args,
	                        /*Conditional=*/true);
	  SingleOpGen.setAction(Action);
	  emitInlinedDirective(CGF, OMPD_single, SingleOpGen);
	  if (DidIt.isValid()) {
	    // did_it = 1;
	    CGF.Builder.CreateStore(CGF.Builder.getInt32(1), DidIt);
	  }
	  Action.Done(CGF);
	  // call __kmpc_copyprivate(ident_t *, gtid, <buf_size>, <copyprivate list>,
	  // <copy_func>, did_it);
	  if (DidIt.isValid()) {
	    llvm::APInt ArraySize(/*unsigned int numBits=*/32, CopyprivateVars.size());
	    auto CopyprivateArrayTy =
	        C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
	                               /*IndexTypeQuals=*/0);
	    // Create a list of all private variables for copyprivate.
	    Address CopyprivateList =
	        CGF.CreateMemTemp(CopyprivateArrayTy, ".omp.copyprivate.cpr_list");
	    for (unsigned I = 0, E = CopyprivateVars.size(); I < E; ++I) {
	      Address Elem = CGF.Builder.CreateConstArrayGEP(
	          CopyprivateList, I, CGF.getPointerSize());
	      CGF.Builder.CreateStore(
	          CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	              CGF.EmitLValue(CopyprivateVars[I]).getPointer(), CGF.VoidPtrTy),
	          Elem);
	    }
	    // Build function that copies private values from single region to all other
	    // threads in the corresponding parallel region.
	    // NOTE(review): SrcExprs is passed in the callee's DestExprs position and
	    // DstExprs in its SrcExprs position - confirm against the callers of this
	    // function that the parameter names here are the ones that are swapped.
	    auto *CpyFn = emitCopyprivateCopyFunction(
	        CGM, CGF.ConvertTypeForMem(CopyprivateArrayTy)->getPointerTo(),
	        CopyprivateVars, SrcExprs, DstExprs, AssignmentOps);
	    auto *BufSize = CGF.getTypeSize(CopyprivateArrayTy);
	    Address CL =
	      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(CopyprivateList,
	                                                      CGF.VoidPtrTy);
	    auto *DidItVal = CGF.Builder.CreateLoad(DidIt);
	    llvm::Value *Args[] = {
	        emitUpdateLocation(CGF, Loc), // ident_t *<loc>
	        getThreadID(CGF, Loc),        // i32 <gtid>
	        BufSize,                      // size_t <buf_size>
	        CL.getPointer(),              // void *<copyprivate list>
	        CpyFn,                        // void (*) (void *, void *) <copy_func>
	        DidItVal                      // i32 did_it
	    };
	    CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_copyprivate), Args);
	  }
	}

	/// Emits an 'ordered' region.
	///
	/// With \p IsThreads set, the body is bracketed by the runtime calls:
	///
	///   __kmpc_ordered(ident_t *, gtid);
	///   OrderedOpGen();
	///   __kmpc_end_ordered(ident_t *, gtid);
	///
	/// Otherwise the body is emitted as an inlined directive without any runtime
	/// calls. Does nothing when \p CGF has no insert point.
	void CGOpenMPRuntime::emitOrderedRegion(CodeGenFunction &CGF,
	                                        const RegionCodeGenTy &OrderedOpGen,
	                                        SourceLocation Loc, bool IsThreads) {
	  if (!CGF.HaveInsertPoint())
	    return;

	  // Plain case first: no enter/exit action is attached.
	  if (!IsThreads) {
	    emitInlinedDirective(CGF, OMPD_ordered, OrderedOpGen);
	    return;
	  }

	  // Prepare arguments and build a call to __kmpc_ordered
	  llvm::Value *LocArg = emitUpdateLocation(CGF, Loc);
	  llvm::Value *ThreadIDArg = getThreadID(CGF, Loc);
	  llvm::Value *CallArgs[] = {LocArg, ThreadIDArg};
	  CommonActionTy OrderedAction(createRuntimeFunction(OMPRTL__kmpc_ordered),
	                               CallArgs,
	                               createRuntimeFunction(OMPRTL__kmpc_end_ordered),
	                               CallArgs);
	  OrderedOpGen.setAction(OrderedAction);
	  emitInlinedDirective(CGF, OMPD_ordered, OrderedOpGen);
	}

	/// Emits a barrier, as either OMPRTL__kmpc_cancel_barrier or
	/// OMPRTL__kmpc_barrier.
	///
	/// \param CGF Function into which code is emitted; nothing happens when it
	/// has no insert point.
	/// \param Loc Location used for the ident_t argument.
	/// \param Kind Directive the barrier belongs to; selects the barrier flag
	/// passed to emitUpdateLocation.
	/// \param EmitChecks When the cancel barrier is used, also emit the branch
	/// that leaves the construct if the call returns non-null.
	/// \param ForceSimpleCall Always use the plain barrier, even inside a region
	/// that has a cancel.
	void CGOpenMPRuntime::emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc,
	                                      OpenMPDirectiveKind Kind, bool EmitChecks,
	                                      bool ForceSimpleCall) {
	  if (!CGF.HaveInsertPoint())
	    return;
	  // Build call __kmpc_cancel_barrier(loc, thread_id);
	  // Build call __kmpc_barrier(loc, thread_id);
	  unsigned Flags;
	  if (Kind == OMPD_for)
	    Flags = OMP_IDENT_BARRIER_IMPL_FOR;
	  else if (Kind == OMPD_sections)
	    Flags = OMP_IDENT_BARRIER_IMPL_SECTIONS;
	  else if (Kind == OMPD_single)
	    Flags = OMP_IDENT_BARRIER_IMPL_SINGLE;
	  else if (Kind == OMPD_barrier)
	    Flags = OMP_IDENT_BARRIER_EXPL;
	  else
	    Flags = OMP_IDENT_BARRIER_IMPL;
	  // Build call __kmpc_cancel_barrier(loc, thread_id) or __kmpc_barrier(loc,
	  // thread_id);
	  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
	                         getThreadID(CGF, Loc)};
	  if (auto *OMPRegionInfo =
	          dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
	    if (!ForceSimpleCall && OMPRegionInfo->hasCancel()) {
	      auto *Result = CGF.EmitRuntimeCall(
	          createRuntimeFunction(OMPRTL__kmpc_cancel_barrier), Args);
	      if (EmitChecks) {
	        // if (__kmpc_cancel_barrier()) {
	        //   exit from construct;
	        // }
	        auto *ExitBB = CGF.createBasicBlock(".cancel.exit");
	        auto *ContBB = CGF.createBasicBlock(".cancel.continue");
	        auto *Cmp = CGF.Builder.CreateIsNotNull(Result);
	        CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB);
	        CGF.EmitBlock(ExitBB);
	        //   exit from construct;
	        auto CancelDestination =
	            CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind());
	        CGF.EmitBranchThroughCleanup(CancelDestination);
	        CGF.EmitBlock(ContBB, /*IsFinished=*/true);
	      }
	      return;
	    }
	  }
	  // Not in a cancellable region (or the simple call was forced).
	  CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_barrier), Args);
	}

	/// \brief Translate a 'schedule' clause kind into the runtime's schedule
	/// enumerator.
	///
	/// \param ScheduleKind Kind given in the clause.
	/// \param Chunked Whether a chunk was specified; only distinguishes the
	/// static variants, and is asserted to be false for the unknown kind.
	/// \param Ordered Selects the OMP_ord_* enumerator instead of OMP_sch_*.
	static OpenMPSchedType getRuntimeSchedule(OpenMPScheduleClauseKind ScheduleKind,
	                                          bool Chunked, bool Ordered) {
	  switch (ScheduleKind) {
	  case OMPC_SCHEDULE_static:
	    if (Chunked) {
	      if (Ordered)
	        return OMP_ord_static_chunked;
	      return OMP_sch_static_chunked;
	    }
	    if (Ordered)
	      return OMP_ord_static;
	    return OMP_sch_static;
	  case OMPC_SCHEDULE_dynamic:
	    if (Ordered)
	      return OMP_ord_dynamic_chunked;
	    return OMP_sch_dynamic_chunked;
	  case OMPC_SCHEDULE_guided:
	    if (Ordered)
	      return OMP_ord_guided_chunked;
	    return OMP_sch_guided_chunked;
	  case OMPC_SCHEDULE_runtime:
	    if (Ordered)
	      return OMP_ord_runtime;
	    return OMP_sch_runtime;
	  case OMPC_SCHEDULE_auto:
	    if (Ordered)
	      return OMP_ord_auto;
	    return OMP_sch_auto;
	  case OMPC_SCHEDULE_unknown:
	    // No schedule kind given: treated like non-chunked static.
	    assert(!Chunked && "chunk was specified but schedule kind not known");
	    if (Ordered)
	      return OMP_ord_static;
	    return OMP_sch_static;
	  }
	  llvm_unreachable("Unexpected runtime schedule");
	}

	/// \brief Translate a 'dist_schedule' clause into the runtime enumerator.
	static OpenMPSchedType
	getRuntimeSchedule(OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) {
	  // dist_schedule only admits static, so the kind itself is not inspected;
	  // only the presence of a chunk selects the enumerator.
	  if (Chunked)
	    return OMP_dist_sch_static_chunked;
	  return OMP_dist_sch_static;
	}

	/// \brief Return true when \a ScheduleKind with the given \a Chunked setting
	/// maps to the unordered, non-chunked static runtime schedule.
	bool CGOpenMPRuntime::isStaticNonchunked(OpenMPScheduleClauseKind ScheduleKind,
	bool Chunked) const {
	// The query is always made for the unordered variant.
	auto Schedule = getRuntimeSchedule(ScheduleKind, Chunked, /Ordered=/false);
	return Schedule == OMP_sch_static;
	}

	/// \brief Return true when the 'dist_schedule' described by \a ScheduleKind
	/// and \a Chunked maps to the non-chunked static distribute schedule.
	bool CGOpenMPRuntime::isStaticNonchunked(
	    OpenMPDistScheduleClauseKind ScheduleKind, bool Chunked) const {
	  return getRuntimeSchedule(ScheduleKind, Chunked) == OMP_dist_sch_static;
	}


	/// \brief Return true when \a ScheduleKind maps to any runtime schedule other
	/// than plain static. The kind is queried as non-chunked and unordered.
	bool CGOpenMPRuntime::isDynamic(OpenMPScheduleClauseKind ScheduleKind) const {
	auto Schedule =
	getRuntimeSchedule(ScheduleKind, /Chunked=/false, /Ordered=/false);
	// Chunked=false was passed above, so the chunked static value cannot occur.
	assert(Schedule != OMP_sch_static_chunked && "cannot be chunked here");
	return Schedule != OMP_sch_static;
	}

	/// \brief Combine a runtime schedule with the schedule clause modifiers.
	///
	/// \param Schedule Base runtime schedule.
	/// \param M1 First schedule modifier.
	/// \param M2 Second schedule modifier; applied after \a M1, so a
	/// monotonic/nonmonotonic value here overrides one set by \a M1.
	/// \return The (possibly adjusted) schedule value OR-ed with the modifier bit.
	static int addMonoNonMonoModifier(OpenMPSchedType Schedule,
	                                  OpenMPScheduleClauseModifier M1,
	                                  OpenMPScheduleClauseModifier M2) {
	  int Modifier = 0;
	  // Both modifiers are handled identically, so share one handler.
	  auto &&ApplyModifier = [&Schedule,
	                          &Modifier](OpenMPScheduleClauseModifier M) {
	    switch (M) {
	    case OMPC_SCHEDULE_MODIFIER_monotonic:
	      Modifier = OMP_sch_modifier_monotonic;
	      break;
	    case OMPC_SCHEDULE_MODIFIER_nonmonotonic:
	      Modifier = OMP_sch_modifier_nonmonotonic;
	      break;
	    case OMPC_SCHEDULE_MODIFIER_simd:
	      // 'simd' only changes the schedule for chunked static loops.
	      if (Schedule == OMP_sch_static_chunked)
	        Schedule = OMP_sch_static_balanced_chunked;
	      break;
	    case OMPC_SCHEDULE_MODIFIER_last:
	    case OMPC_SCHEDULE_MODIFIER_unknown:
	      break;
	    }
	  };
	  ApplyModifier(M1);
	  ApplyModifier(M2);
	  return Schedule | Modifier;
	}

	/// \brief Emit the dispatch-init runtime call for a dynamically scheduled (or
	/// ordered) loop.
	///
	/// \param ScheduleKind Schedule kind and modifiers from the clause.
	/// \param IVSize Bit width of the iteration variable; selects the runtime
	/// entry together with \a IVSigned and sizes the constant arguments.
	/// \param UB Upper bound passed to the runtime; the lower bound is always 0
	/// and the stride always 1.
	/// \param Chunk Chunk value, or nullptr to use 1.
	void CGOpenMPRuntime::emitForDispatchInit(CodeGenFunction &CGF,
	SourceLocation Loc,
	const OpenMPScheduleTy &ScheduleKind,
	unsigned IVSize, bool IVSigned,
	bool Ordered, llvm::Value *UB,
	llvm::Value *Chunk) {
	if (!CGF.HaveInsertPoint())
	return;
	OpenMPSchedType Schedule =
	getRuntimeSchedule(ScheduleKind.Schedule, Chunk != nullptr, Ordered);
	// Unordered static schedules are not expected here.
	assert(Ordered \|\|
	(Schedule != OMP_sch_static && Schedule != OMP_sch_static_chunked &&
	Schedule != OMP_ord_static && Schedule != OMP_ord_static_chunked &&
	Schedule != OMP_sch_static_balanced_chunked));
	// Call __kmpc_dispatch_init(
	// ident_t *loc, kmp_int32 tid, kmp_int32 schedule,
	// kmp_int[32\|64] lower, kmp_int[32\|64] upper,
	// kmp_int[32\|64] stride, kmp_int[32\|64] chunk);

	// If the Chunk was not specified in the clause - use default value 1.
	if (Chunk == nullptr)
	Chunk = CGF.Builder.getIntN(IVSize, 1);
	llvm::Value *Args[] = {
	emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
	CGF.Builder.getInt32(addMonoNonMonoModifier(
	Schedule, ScheduleKind.M1, ScheduleKind.M2)), // Schedule type
	CGF.Builder.getIntN(IVSize, 0), // Lower
	UB, // Upper
	CGF.Builder.getIntN(IVSize, 1), // Stride
	Chunk // Chunk
	};
	CGF.EmitRuntimeCall(createDispatchInitFunction(IVSize, IVSigned), Args);
	}

	static void emitForStaticInitCall(
	CodeGenFunction &CGF, llvm::Value UpdateLocation, llvm::Value ThreadId,
	llvm::Constant *ForStaticInitFunction, OpenMPSchedType Schedule,
	OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2,
	unsigned IVSize, bool Ordered, Address IL, Address LB, Address UB,
	Address ST, llvm::Value *Chunk) {
	if (!CGF.HaveInsertPoint())
	return;

	assert(!Ordered);
	assert(Schedule == OMP_sch_static \|\| Schedule == OMP_sch_static_chunked \|\|
	Schedule == OMP_sch_static_balanced_chunked \|\|
	Schedule == OMP_ord_static \|\| Schedule == OMP_ord_static_chunked \|\|
	Schedule == OMP_dist_sch_static \|\|
	Schedule == OMP_dist_sch_static_chunked);

	// Call __kmpc_for_static_init(
	// ident_t *loc, kmp_int32 tid, kmp_int32 schedtype,
	// kmp_int32 p_lastiter, kmp_int[32\|64] p_lower,
	// kmp_int[32\|64] p_upper, kmp_int[32\|64] p_stride,
	// kmp_int[32\|64] incr, kmp_int[32\|64] chunk);
	if (Chunk == nullptr) {
	assert((Schedule == OMP_sch_static \|\| Schedule == OMP_ord_static \|\|
	Schedule == OMP_dist_sch_static) &&
	"expected static non-chunked schedule");
	// If the Chunk was not specified in the clause - use default value 1.
	Chunk = CGF.Builder.getIntN(IVSize, 1);
	} else {
	assert((Schedule == OMP_sch_static_chunked \|\|
	Schedule == OMP_sch_static_balanced_chunked \|\|
	Schedule == OMP_ord_static_chunked \|\|
	Schedule == OMP_dist_sch_static_chunked) &&
	"expected static chunked schedule");
	}
	llvm::Value *Args[] = {
	UpdateLocation, ThreadId, CGF.Builder.getInt32(addMonoNonMonoModifier(
	Schedule, M1, M2)), // Schedule type
	IL.getPointer(), // &isLastIter
	LB.getPointer(), // &LB
	UB.getPointer(), // &UB
	ST.getPointer(), // &Stride
	CGF.Builder.getIntN(IVSize, 1), // Incr
	Chunk // Chunk
	};
	CGF.EmitRuntimeCall(ForStaticInitFunction, Args);
	}

	/// \brief Emit the static-init runtime call for a loop governed by a
	/// 'schedule' clause.
	///
	/// Resolves the runtime schedule, location, thread id and runtime entry, then
	/// forwards everything (including the clause modifiers) to
	/// emitForStaticInitCall.
	void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF,
	                                        SourceLocation Loc,
	                                        const OpenMPScheduleTy &ScheduleKind,
	                                        unsigned IVSize, bool IVSigned,
	                                        bool Ordered, Address IL, Address LB,
	                                        Address UB, Address ST,
	                                        llvm::Value *Chunk) {
	  const bool HasChunk = Chunk != nullptr;
	  OpenMPSchedType RuntimeSched =
	      getRuntimeSchedule(ScheduleKind.Schedule, HasChunk, Ordered);
	  // Keep these three calls in this order; each may emit code or declarations.
	  llvm::Value *LocArg = emitUpdateLocation(CGF, Loc);
	  llvm::Value *TidArg = getThreadID(CGF, Loc);
	  auto *InitFn = createForStaticInitFunction(IVSize, IVSigned);
	  emitForStaticInitCall(CGF, LocArg, TidArg, InitFn, RuntimeSched,
	                        ScheduleKind.M1, ScheduleKind.M2, IVSize, Ordered, IL,
	                        LB, UB, ST, Chunk);
	}

	/// \brief Emit the static-init runtime call for a 'distribute' loop.
	///
	/// Same flow as the worksharing-loop variant, except the schedule comes from
	/// the 'dist_schedule' kind and no schedule modifiers are passed.
	void CGOpenMPRuntime::emitDistributeStaticInit(
	    CodeGenFunction &CGF, SourceLocation Loc,
	    OpenMPDistScheduleClauseKind SchedKind, unsigned IVSize, bool IVSigned,
	    bool Ordered, Address IL, Address LB, Address UB, Address ST,
	    llvm::Value *Chunk) {
	  const bool HasChunk = Chunk != nullptr;
	  OpenMPSchedType RuntimeSched = getRuntimeSchedule(SchedKind, HasChunk);
	  // Keep these three calls in this order; each may emit code or declarations.
	  llvm::Value *LocArg = emitUpdateLocation(CGF, Loc);
	  llvm::Value *TidArg = getThreadID(CGF, Loc);
	  auto *InitFn = createForStaticInitFunction(IVSize, IVSigned);
	  const OpenMPScheduleClauseModifier NoModifier =
	      OMPC_SCHEDULE_MODIFIER_unknown;
	  emitForStaticInitCall(CGF, LocArg, TidArg, InitFn, RuntimeSched, NoModifier,
	                        NoModifier, IVSize, Ordered, IL, LB, UB, ST, Chunk);
	}

	/// \brief Emit the call that finishes a statically scheduled loop.
	void CGOpenMPRuntime::emitForStaticFinish(CodeGenFunction &CGF,
	                                          SourceLocation Loc) {
	  if (CGF.HaveInsertPoint()) {
	    // __kmpc_for_static_fini(ident_t *loc, kmp_int32 tid);
	    llvm::Value *FiniArgs[] = {emitUpdateLocation(CGF, Loc),
	                               getThreadID(CGF, Loc)};
	    auto *FiniFn = createRuntimeFunction(OMPRTL__kmpc_for_static_fini);
	    CGF.EmitRuntimeCall(FiniFn, FiniArgs);
	  }
	}

	/// \brief Emit the dispatch-fini runtime call at the end of an ordered
	/// iteration. \a IVSize and \a IVSigned select the runtime entry.
	void CGOpenMPRuntime::emitForOrderedIterationEnd(CodeGenFunction &CGF,
	                                                 SourceLocation Loc,
	                                                 unsigned IVSize,
	                                                 bool IVSigned) {
	  if (CGF.HaveInsertPoint()) {
	    // The runtime entry takes (ident_t *loc, kmp_int32 tid).
	    llvm::Value *FiniArgs[] = {emitUpdateLocation(CGF, Loc),
	                               getThreadID(CGF, Loc)};
	    auto *FiniFn = createDispatchFiniFunction(IVSize, IVSigned);
	    CGF.EmitRuntimeCall(FiniFn, FiniArgs);
	  }
	}

	/// \brief Emit the dispatch-next runtime call and return its result converted
	/// to a boolean value.
	///
	/// \a IL, \a LB, \a UB and \a ST are passed to the runtime by pointer.
	llvm::Value *CGOpenMPRuntime::emitForNext(CodeGenFunction &CGF,
	                                          SourceLocation Loc, unsigned IVSize,
	                                          bool IVSigned, Address IL,
	                                          Address LB, Address UB,
	                                          Address ST) {
	  // __kmpc_dispatch_next(ident_t *loc, kmp_int32 tid, kmp_int32 *p_lastiter,
	  //                      p_lower, p_upper, p_stride);
	  // where the last three are pointers to 32- or 64-bit integers.
	  llvm::Value *NextArgs[] = {
	      emitUpdateLocation(CGF, Loc),
	      getThreadID(CGF, Loc),
	      IL.getPointer(), // &isLastIter
	      LB.getPointer(), // &Lower
	      UB.getPointer(), // &Upper
	      ST.getPointer()  // &Stride
	  };
	  auto *NextFn = createDispatchNextFunction(IVSize, IVSigned);
	  llvm::Value *RawResult = CGF.EmitRuntimeCall(NextFn, NextArgs);
	  // The call result is treated as a signed 32-bit integer and narrowed to bool.
	  QualType Int32QTy =
	      CGF.getContext().getIntTypeForBitwidth(32, /* Signed */ true);
	  return CGF.EmitScalarConversion(RawResult, Int32QTy,
	                                  CGF.getContext().BoolTy, Loc);
	}

	/// \brief Emit the runtime call that records the value of a 'num_threads'
	/// clause. \a NumThreads is cast to a signed 32-bit integer first.
	void CGOpenMPRuntime::emitNumThreadsClause(CodeGenFunction &CGF,
	llvm::Value *NumThreads,
	SourceLocation Loc) {
	if (!CGF.HaveInsertPoint())
	return;
	// Build call __kmpc_push_num_threads(&loc, global_tid, num_threads)
	llvm::Value *Args[] = {
	emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
	CGF.Builder.CreateIntCast(NumThreads, CGF.Int32Ty, /isSigned/ true)};
	CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_threads),
	Args);
	}

	/// \brief Emit the runtime call that records the value of a 'proc_bind'
	/// clause.
	///
	/// \a ProcBind is translated to the local ProcBindTy numbering; only master,
	/// close and spread are handled, and the unknown kind is unreachable.
	void CGOpenMPRuntime::emitProcBindClause(CodeGenFunction &CGF,
	OpenMPProcBindClauseKind ProcBind,
	SourceLocation Loc) {
	if (!CGF.HaveInsertPoint())
	return;
	// Constants for proc bind value accepted by the runtime.
	enum ProcBindTy {
	ProcBindFalse = 0,
	ProcBindTrue,
	ProcBindMaster,
	ProcBindClose,
	ProcBindSpread,
	ProcBindIntel,
	ProcBindDefault
	} RuntimeProcBind;
	// Every reachable case assigns RuntimeProcBind before it is read below.
	switch (ProcBind) {
	case OMPC_PROC_BIND_master:
	RuntimeProcBind = ProcBindMaster;
	break;
	case OMPC_PROC_BIND_close:
	RuntimeProcBind = ProcBindClose;
	break;
	case OMPC_PROC_BIND_spread:
	RuntimeProcBind = ProcBindSpread;
	break;
	case OMPC_PROC_BIND_unknown:
	llvm_unreachable("Unsupported proc_bind value.");
	}
	// Build call __kmpc_push_proc_bind(&loc, global_tid, proc_bind)
	llvm::Value *Args[] = {
	emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
	llvm::ConstantInt::get(CGM.IntTy, RuntimeProcBind, /isSigned=/true)};
	CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_proc_bind), Args);
	}

	/// \brief Emit the runtime flush call. The list of flushed expressions is
	/// accepted but not used; only the source location is passed on.
	void CGOpenMPRuntime::emitFlush(CodeGenFunction &CGF, ArrayRef<const Expr *>,
	                                SourceLocation Loc) {
	  if (CGF.HaveInsertPoint()) {
	    // void __kmpc_flush(ident_t *loc)
	    CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_flush),
	                        emitUpdateLocation(CGF, Loc));
	  }
	}

	namespace {
	/// \brief Indexes of fields for type kmp_task_t.
	///
	/// The enumerator order matches the order in which createKmpTaskTRecordDecl
	/// adds the fields (shareds, routine, part_id, data1, data2, then the
	/// taskloop-only lb, ub, st, liter).
	enum KmpTaskTFields {
	/// \brief List of shared variables.
	KmpTaskTShareds,
	/// \brief Task routine.
	KmpTaskTRoutine,
	/// \brief Partition id for the untied tasks.
	KmpTaskTPartId,
	/// Function with call of destructors for private variables
	/// (the data1 field of kmp_task_t).
	Data1,
	/// Task priority (the data2 field of kmp_task_t).
	Data2,
	/// (Taskloops only) Lower bound.
	KmpTaskTLowerBound,
	/// (Taskloops only) Upper bound.
	KmpTaskTUpperBound,
	/// (Taskloops only) Stride.
	KmpTaskTStride,
	/// (Taskloops only) Is last iteration flag.
	KmpTaskTLastIter,
	};
	} // anonymous namespace

	/// \brief Tell whether no offloading entries have been recorded. Only target
	/// region entries are considered.
	bool CGOpenMPRuntime::OffloadEntriesInfoManagerTy::empty() const {
	  // FIXME: Add other entries type when they become supported.
	  const bool NoTargetRegions = OffloadEntriesTargetRegion.empty();
	  return NoTargetRegions;
	}

	/// \brief Initialize target region entry.
	///
	/// Stores an entry with the given \a Order and null address/ID under the key
	/// (DeviceID, FileID, ParentName, LineNum) and bumps the entry counter. Only
	/// expected to be called during device code generation.
	void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::
	initializeTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID,
	StringRef ParentName, unsigned LineNum,
	unsigned Order) {
	assert(CGM.getLangOpts().OpenMPIsDevice && "Initialization of entries is "
	"only required for the device "
	"code generation.");
	// Address and ID stay null until the entry is registered.
	OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] =
	OffloadEntryInfoTargetRegion(Order, /Addr=/nullptr, /ID=/nullptr,
	/Flags=/0);
	++OffloadingEntriesNum;
	}

	void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::
	registerTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID,
	StringRef ParentName, unsigned LineNum,
	llvm::Constant Addr, llvm::Constant ID,
	int32_t Flags) {
	// If we are emitting code for a target, the entry is already initialized,
	// only has to be registered.
	if (CGM.getLangOpts().OpenMPIsDevice) {
	assert(hasTargetRegionEntryInfo(DeviceID, FileID, ParentName, LineNum) &&
	"Entry must exist.");
	auto &Entry =
	OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum];
	assert(Entry.isValid() && "Entry not initialized!");
	Entry.setAddress(Addr);
	Entry.setID(ID);
	Entry.setFlags(Flags);
	return;
	} else {
	OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum++, Addr, ID, Flags);
	OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] = Entry;
	}
	}

	/// \brief Return true when an entry exists for the key
	/// (DeviceID, FileID, ParentName, LineNum) and has not been registered yet.
	///
	/// An entry that already has an address or an ID counts as registered, and
	/// false is returned for it.
	bool CGOpenMPRuntime::OffloadEntriesInfoManagerTy::hasTargetRegionEntryInfo(
	unsigned DeviceID, unsigned FileID, StringRef ParentName,
	unsigned LineNum) const {
	// Walk the nested maps with find() so that no level is created on lookup.
	auto PerDevice = OffloadEntriesTargetRegion.find(DeviceID);
	if (PerDevice == OffloadEntriesTargetRegion.end())
	return false;
	auto PerFile = PerDevice->second.find(FileID);
	if (PerFile == PerDevice->second.end())
	return false;
	auto PerParentName = PerFile->second.find(ParentName);
	if (PerParentName == PerFile->second.end())
	return false;
	auto PerLine = PerParentName->second.find(LineNum);
	if (PerLine == PerParentName->second.end())
	return false;
	// Fail if this entry is already registered.
	if (PerLine->second.getAddress() \|\| PerLine->second.getID())
	return false;
	return true;
	}

	/// \brief Invoke \a Action once for every recorded target region entry,
	/// passing the device id, file id, parent name, line and the entry itself.
	void CGOpenMPRuntime::OffloadEntriesInfoManagerTy::actOnTargetRegionEntriesInfo(
	    const OffloadTargetRegionEntryInfoActTy &Action) {
	  // The table is nested as device -> file -> parent name -> line -> entry.
	  for (auto &PerDevice : OffloadEntriesTargetRegion) {
	    for (auto &PerFile : PerDevice.second) {
	      for (auto &PerParent : PerFile.second) {
	        for (auto &PerLine : PerParent.second) {
	          Action(PerDevice.first, PerFile.first, PerParent.first(),
	                 PerLine.first, PerLine.second);
	        }
	      }
	    }
	  }
	}

	/// \brief Create a Ctor/Dtor-like function whose body is emitted through
	/// \a Codegen. This is used to emit the two functions that register and
	/// unregister the descriptor of the current compilation unit.
	///
	/// The created function returns void and takes a single unnamed void pointer
	/// parameter. \returns the created function.
	static llvm::Function *
	createOffloadingBinaryDescriptorFunction(CodeGenModule &CGM, StringRef Name,
	const RegionCodeGenTy &Codegen) {
	auto &C = CGM.getContext();
	FunctionArgList Args;
	// Single unnamed void* parameter; the body emitted here does not name it.
	ImplicitParamDecl DummyPtr(C, /DC=/nullptr, SourceLocation(),
	/Id=/nullptr, C.VoidPtrTy);
	Args.push_back(&DummyPtr);

	CodeGenFunction CGF(CGM);
	auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	auto FTy = CGM.getTypes().GetFunctionType(FI);
	auto *Fn =
	CGM.CreateGlobalInitOrDestructFunction(FTy, Name, FI, SourceLocation());
	// The whole body comes from the caller-supplied generator.
	CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FI, Args, SourceLocation());
	Codegen(CGF);
	CGF.FinishFunction();
	return Fn;
	}

	/// \brief Create the offloading binary descriptor and the function that
	/// registers it with the offloading runtime.
	///
	/// Builds, in order: the external begin/end symbols of the host entries
	/// section, one device image record per target triple, the device images
	/// array, the binary descriptor, and the unregistration and registration
	/// functions. The registration function also registers the unregistration
	/// function as a global destructor.
	///
	/// \returns the registration function, or nullptr when emitting device code
	/// or when there are no offloading entries.
	llvm::Function *
	CGOpenMPRuntime::createOffloadingBinaryDescriptorRegistration() {

	// If we don't have entries or if we are emitting code for the device, we
	// don't need to do anything.
	if (CGM.getLangOpts().OpenMPIsDevice \|\| OffloadEntriesInfoManager.empty())
	return nullptr;

	auto &M = CGM.getModule();
	auto &C = CGM.getContext();

	// Get list of devices we care about
	auto &Devices = CGM.getLangOpts().OMPTargetTriples;

	// We should be creating an offloading descriptor only if there are devices
	// specified.
	assert(!Devices.empty() && "No OpenMP offloading devices??");

	// Create the external variables that will point to the begin and end of the
	// host entries section. These will be defined by the linker.
	auto *OffloadEntryTy =
	CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy());
	llvm::GlobalVariable *HostEntriesBegin = new llvm::GlobalVariable(
	M, OffloadEntryTy, /isConstant=/true,
	llvm::GlobalValue::ExternalLinkage, /Initializer=/nullptr,
	".omp_offloading.entries_begin");
	llvm::GlobalVariable *HostEntriesEnd = new llvm::GlobalVariable(
	M, OffloadEntryTy, /isConstant=/true,
	llvm::GlobalValue::ExternalLinkage, /Initializer=/nullptr,
	".omp_offloading.entries_end");

	// Create all device images
	auto *DeviceImageTy = cast<llvm::StructType>(
	CGM.getTypes().ConvertTypeForMem(getTgtDeviceImageQTy()));
	ConstantInitBuilder DeviceImagesBuilder(CGM);
	auto DeviceImagesEntries = DeviceImagesBuilder.beginArray(DeviceImageTy);

	// One image record per target triple; the image start/end symbols are
	// external declarations whose names embed the triple.
	for (unsigned i = 0; i < Devices.size(); ++i) {
	StringRef T = Devices[i].getTriple();
	auto *ImgBegin = new llvm::GlobalVariable(
	M, CGM.Int8Ty, /isConstant=/true, llvm::GlobalValue::ExternalLinkage,
	/Initializer=/nullptr,
	Twine(".omp_offloading.img_start.") + Twine(T));
	auto *ImgEnd = new llvm::GlobalVariable(
	M, CGM.Int8Ty, /isConstant=/true, llvm::GlobalValue::ExternalLinkage,
	/Initializer=/nullptr, Twine(".omp_offloading.img_end.") + Twine(T));

	// Every image record carries the same host entries range.
	auto Dev = DeviceImagesEntries.beginStruct(DeviceImageTy);
	Dev.add(ImgBegin);
	Dev.add(ImgEnd);
	Dev.add(HostEntriesBegin);
	Dev.add(HostEntriesEnd);
	Dev.finishAndAddTo(DeviceImagesEntries);
	}

	// Create device images global array.
	llvm::GlobalVariable *DeviceImages =
	DeviceImagesEntries.finishAndCreateGlobal(".omp_offloading.device_images",
	CGM.getPointerAlign(),
	/isConstant=/true);
	DeviceImages->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);

	// This is a Zero array to be used in the creation of the constant expressions
	llvm::Constant *Index[] = {llvm::Constant::getNullValue(CGM.Int32Ty),
	llvm::Constant::getNullValue(CGM.Int32Ty)};

	// Create the target region descriptor.
	auto *BinaryDescriptorTy = cast<llvm::StructType>(
	CGM.getTypes().ConvertTypeForMem(getTgtBinaryDescriptorQTy()));
	ConstantInitBuilder DescBuilder(CGM);
	auto DescInit = DescBuilder.beginStruct(BinaryDescriptorTy);
	// Fields: device count, pointer to the first device image, entries range.
	DescInit.addInt(CGM.Int32Ty, Devices.size());
	DescInit.add(llvm::ConstantExpr::getGetElementPtr(DeviceImages->getValueType(),
	DeviceImages,
	Index));
	DescInit.add(HostEntriesBegin);
	DescInit.add(HostEntriesEnd);

	auto *Desc = DescInit.finishAndCreateGlobal(".omp_offloading.descriptor",
	CGM.getPointerAlign(),
	/isConstant=/true);

	// Emit code to register or unregister the descriptor at execution
	// startup or closing, respectively.

	// Create a variable to drive the registration and unregistration of the
	// descriptor, so we can reuse the logic that emits Ctors and Dtors.
	auto *IdentInfo = &C.Idents.get(".omp_offloading.reg_unreg_var");
	ImplicitParamDecl RegUnregVar(C, C.getTranslationUnitDecl(), SourceLocation(),
	IdentInfo, C.CharTy);

	// The unregistration function is created first because the registration
	// function's body refers to it.
	auto *UnRegFn = createOffloadingBinaryDescriptorFunction(
	CGM, ".omp_offloading.descriptor_unreg",
	[&](CodeGenFunction &CGF, PrePostActionTy &) {
	CGF.EmitCallOrInvoke(createRuntimeFunction(OMPRTL__tgt_unregister_lib),
	Desc);
	});
	auto *RegFn = createOffloadingBinaryDescriptorFunction(
	CGM, ".omp_offloading.descriptor_reg",
	[&](CodeGenFunction &CGF, PrePostActionTy &) {
	CGF.EmitCallOrInvoke(createRuntimeFunction(OMPRTL__tgt_register_lib),
	Desc);
	// Arrange for the descriptor to be unregistered via UnRegFn.
	CGM.getCXXABI().registerGlobalDtor(CGF, RegUnregVar, UnRegFn, Desc);
	});
	return RegFn;
	}

	/// \brief Emit one offload entry record into the ".omp_offloading.entries"
	/// section.
	///
	/// \param ID Constant whose address is stored in the entry's first field.
	/// \param Addr Constant whose name is stored as the entry name string.
	/// \param Size Value of the entry's size field.
	/// \param Flags Value of the entry's flags field.
	void CGOpenMPRuntime::createOffloadEntry(llvm::Constant *ID,
	llvm::Constant *Addr, uint64_t Size,
	int32_t Flags) {
	StringRef Name = Addr->getName();
	auto *TgtOffloadEntryType = cast<llvm::StructType>(
	CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy()));
	llvm::LLVMContext &C = CGM.getModule().getContext();
	llvm::Module &M = CGM.getModule();

	// Make sure the address has the right type.
	// Note that the stored pointer is built from ID, not from Addr.
	llvm::Constant *AddrPtr = llvm::ConstantExpr::getBitCast(ID, CGM.VoidPtrTy);

	// Create constant string with the name.
	llvm::Constant *StrPtrInit = llvm::ConstantDataArray::getString(C, Name);

	llvm::GlobalVariable *Str =
	new llvm::GlobalVariable(M, StrPtrInit->getType(), /isConstant=/true,
	llvm::GlobalValue::InternalLinkage, StrPtrInit,
	".omp_offloading.entry_name");
	Str->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
	llvm::Constant *StrPtr = llvm::ConstantExpr::getBitCast(Str, CGM.Int8PtrTy);

	// We can't have any padding between symbols, so we need to have 1-byte
	// alignment.
	auto Align = CharUnits::fromQuantity(1);

	// Create the entry struct.
	// Field order: address, name, size, flags, reserved (always 0).
	ConstantInitBuilder EntryBuilder(CGM);
	auto EntryInit = EntryBuilder.beginStruct(TgtOffloadEntryType);
	EntryInit.add(AddrPtr);
	EntryInit.add(StrPtr);
	EntryInit.addInt(CGM.SizeTy, Size);
	EntryInit.addInt(CGM.Int32Ty, Flags);
	EntryInit.addInt(CGM.Int32Ty, 0);
	llvm::GlobalVariable *Entry =
	EntryInit.finishAndCreateGlobal(".omp_offloading.entry",
	Align,
	/constant/ true,
	llvm::GlobalValue::ExternalLinkage);

	// The entry has to be created in the section the linker expects it to be.
	Entry->setSection(".omp_offloading.entries");
	}

	/// \brief Emit the "omp_offload.info" named metadata and the offload entry
	/// records for all registered target region entries.
	///
	/// Metadata nodes are added in table iteration order; the entry records are
	/// then created in the order given by each entry's order number.
	void CGOpenMPRuntime::createOffloadEntriesAndInfoMetadata() {
	// Emit the offloading entries and metadata so that the device codegen side
	// can easily figure out what to emit. The produced metadata looks like
	// this:
	//
	// !omp_offload.info = !{!1, ...}
	//
	// Right now we only generate metadata for function that contain target
	// regions.

	// If we do not have entries, we don't need to do anything.
	if (OffloadEntriesInfoManager.empty())
	return;

	llvm::Module &M = CGM.getModule();
	llvm::LLVMContext &C = M.getContext();
	// One slot per entry, indexed by the entry's order number.
	SmallVector<OffloadEntriesInfoManagerTy::OffloadEntryInfo *, 16>
	OrderedEntries(OffloadEntriesInfoManager.size());

	// Create the offloading info metadata node.
	llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");

	// Auxiliary methods to create metadata values and strings.
	auto getMDInt = [&](unsigned v) {
	return llvm::ConstantAsMetadata::get(
	llvm::ConstantInt::get(llvm::Type::getInt32Ty(C), v));
	};

	auto getMDString = [&](StringRef v) { return llvm::MDString::get(C, v); };

	// Create function that emits metadata for each target region entry;
	auto &&TargetRegionMetadataEmitter = [&](
	unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned Line,
	OffloadEntriesInfoManagerTy::OffloadEntryInfoTargetRegion &E) {
	llvm::SmallVector<llvm::Metadata *, 32> Ops;
	// Generate metadata for target regions. Each entry of this metadata
	// contains:
	// - Entry 0 -> Kind of this type of metadata (0).
	// - Entry 1 -> Device ID of the file where the entry was identified.
	// - Entry 2 -> File ID of the file where the entry was identified.
	// - Entry 3 -> Mangled name of the function where the entry was identified.
	// - Entry 4 -> Line in the file where the entry was identified.
	// - Entry 5 -> Order the entry was created.
	// The first element of the metadata node is the kind.
	Ops.push_back(getMDInt(E.getKind()));
	Ops.push_back(getMDInt(DeviceID));
	Ops.push_back(getMDInt(FileID));
	Ops.push_back(getMDString(ParentName));
	Ops.push_back(getMDInt(Line));
	Ops.push_back(getMDInt(E.getOrder()));

	// Save this entry in the right position of the ordered entries array.
	OrderedEntries[E.getOrder()] = &E;

	// Add metadata to the named metadata node.
	MD->addOperand(llvm::MDNode::get(C, Ops));
	};

	OffloadEntriesInfoManager.actOnTargetRegionEntriesInfo(
	TargetRegionMetadataEmitter);

	// Emit the entry records in creation order.
	for (auto *E : OrderedEntries) {
	assert(E && "All ordered entries must exist!");
	if (auto *CE =
	dyn_cast<OffloadEntriesInfoManagerTy::OffloadEntryInfoTargetRegion>(
	E)) {
	assert(CE->getID() && CE->getAddress() &&
	"Entry ID and Addr are invalid!");
	// NOTE(review): no Flags argument is passed here, so the flags stored on
	// the entry are not forwarded - confirm this is intended.
	createOffloadEntry(CE->getID(), CE->getAddress(), /Size=/0);
	} else
	llvm_unreachable("Unsupported entry kind.");
	}
	}

	/// \brief Loads all the offload entries information from the host IR
	/// metadata.
	///
	/// Does nothing when not compiling for the device, when no host IR file is
	/// set, when the file cannot be read or parsed, or when it has no
	/// "omp_offload.info" metadata. These cases return without a diagnostic from
	/// this function.
	void CGOpenMPRuntime::loadOffloadInfoMetadata() {
	// If we are in target mode, load the metadata from the host IR. This code has
	// to match the metadata creation in createOffloadEntriesAndInfoMetadata().

	if (!CGM.getLangOpts().OpenMPIsDevice)
	return;

	if (CGM.getLangOpts().OMPHostIRFile.empty())
	return;

	auto Buf = llvm::MemoryBuffer::getFile(CGM.getLangOpts().OMPHostIRFile);
	if (Buf.getError())
	return;

	// The host module is parsed into its own temporary context.
	llvm::LLVMContext C;
	auto ME = expectedToErrorOrAndEmitErrors(
	C, llvm::parseBitcodeFile(Buf.get()->getMemBufferRef(), C));

	if (ME.getError())
	return;

	llvm::NamedMDNode *MD = ME.get()->getNamedMetadata("omp_offload.info");
	if (!MD)
	return;

	for (auto I : MD->operands()) {
	llvm::MDNode *MN = cast<llvm::MDNode>(I);

	// Read operand Idx of the current node as an unsigned integer.
	auto getMDInt = [&](unsigned Idx) {
	llvm::ConstantAsMetadata *V =
	cast<llvm::ConstantAsMetadata>(MN->getOperand(Idx));
	return cast<llvm::ConstantInt>(V->getValue())->getZExtValue();
	};

	// Read operand Idx of the current node as a string.
	auto getMDString = [&](unsigned Idx) {
	llvm::MDString *V = cast<llvm::MDString>(MN->getOperand(Idx));
	return V->getString();
	};

	// Operand 0 is the entry kind; the remaining operands depend on it.
	switch (getMDInt(0)) {
	default:
	llvm_unreachable("Unexpected metadata!");
	break;
	case OffloadEntriesInfoManagerTy::OffloadEntryInfo::
	OFFLOAD_ENTRY_INFO_TARGET_REGION:
	OffloadEntriesInfoManager.initializeTargetRegionEntryInfo(
	/DeviceID=/getMDInt(1), /FileID=/getMDInt(2),
	/ParentName=/getMDString(3), /Line=/getMDInt(4),
	/Order=/getMDInt(5));
	break;
	}
	}
	}

	/// \brief Build the kmp_routine_entry_t function pointer type on first use
	/// and cache both its AST and its converted form.
	void CGOpenMPRuntime::emitKmpRoutineEntryT(QualType KmpInt32Ty) {
	  // Already built by an earlier call.
	  if (KmpRoutineEntryPtrTy)
	    return;
	  // typedef kmp_int32 (* kmp_routine_entry_t)(kmp_int32, void *);
	  auto &Ctx = CGM.getContext();
	  QualType ParamTys[] = {KmpInt32Ty, Ctx.VoidPtrTy};
	  FunctionProtoType::ExtProtoInfo ProtoInfo;
	  QualType RoutineFnTy = Ctx.getFunctionType(KmpInt32Ty, ParamTys, ProtoInfo);
	  KmpRoutineEntryPtrQTy = Ctx.getPointerType(RoutineFnTy);
	  KmpRoutineEntryPtrTy = CGM.getTypes().ConvertType(KmpRoutineEntryPtrQTy);
	}

	/// \brief Create an unnamed public field of type \a FieldTy and append it to
	/// \a DC.
	///
	/// The field has no bit width, is not mutable and has no in-class
	/// initializer. \returns the newly created field.
	static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC,
	                                       QualType FieldTy) {
	  auto *Field = FieldDecl::Create(
	      C, DC, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
	      C.getTrivialTypeSourceInfo(FieldTy, SourceLocation()),
	      /*BW=*/nullptr, /*Mutable=*/false, /*InitStyle=*/ICIS_NoInit);
	  Field->setAccess(AS_public);
	  DC->addDecl(Field);
	  return Field;
	}

	/// \brief Return the type of the __tgt_offload_entry record, building and
	/// caching it on the first call.
	QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() {

	// Make sure the type of the entry is already created. This is the type we
	// have to create:
	// struct __tgt_offload_entry{
	// void *addr; // Pointer to the offload entry info.
	// // (function or global)
	// char *name; // Name of the function or global.
	// size_t size; // Size of the entry info (0 if it is a function).
	// int32_t flags; // Flags associated with the entry, e.g. 'link'.
	// int32_t reserved; // Reserved, to use by the runtime library.
	// };
	if (TgtOffloadEntryQTy.isNull()) {
	ASTContext &C = CGM.getContext();
	auto *RD = C.buildImplicitRecord("__tgt_offload_entry");
	RD->startDefinition();
	// Fields are added in the order shown in the struct sketch above.
	addFieldToRecordDecl(C, RD, C.VoidPtrTy);
	addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy));
	addFieldToRecordDecl(C, RD, C.getSizeType());
	addFieldToRecordDecl(
	C, RD, C.getIntTypeForBitwidth(/DestWidth=/32, /Signed=/true));
	addFieldToRecordDecl(
	C, RD, C.getIntTypeForBitwidth(/DestWidth=/32, /Signed=/true));
	RD->completeDefinition();
	TgtOffloadEntryQTy = C.getRecordType(RD);
	}
	return TgtOffloadEntryQTy;
	}

	/// \brief Return the type of the __tgt_device_image record, building and
	/// caching it on the first call.
	QualType CGOpenMPRuntime::getTgtDeviceImageQTy() {
	  // Reuse the cached type when it has been built already.
	  if (!TgtDeviceImageQTy.isNull())
	    return TgtDeviceImageQTy;

	  // The record built here corresponds to:
	  // struct __tgt_device_image{
	  //   void *ImageStart;                  // Pointer to the target code start.
	  //   void *ImageEnd;                    // Pointer to the target code end.
	  //   // The host entries are part of the image record too, as they may be
	  //   // useful for the target runtime.
	  //   __tgt_offload_entry *EntriesBegin; // Begin of the table with all
	  //                                      // the entries.
	  //   __tgt_offload_entry *EntriesEnd;   // End of the table with all the
	  //                                      // entries (non inclusive).
	  // };
	  ASTContext &Ctx = CGM.getContext();
	  auto *ImageRD = Ctx.buildImplicitRecord("__tgt_device_image");
	  ImageRD->startDefinition();
	  addFieldToRecordDecl(Ctx, ImageRD, Ctx.VoidPtrTy); // ImageStart
	  addFieldToRecordDecl(Ctx, ImageRD, Ctx.VoidPtrTy); // ImageEnd
	  addFieldToRecordDecl(Ctx, ImageRD,
	                       Ctx.getPointerType(getTgtOffloadEntryQTy())); // Begin
	  addFieldToRecordDecl(Ctx, ImageRD,
	                       Ctx.getPointerType(getTgtOffloadEntryQTy())); // End
	  ImageRD->completeDefinition();
	  TgtDeviceImageQTy = Ctx.getRecordType(ImageRD);
	  return TgtDeviceImageQTy;
	}

	/// \brief Return the type of the __tgt_bin_desc record, building and caching
	/// it on the first call.
	QualType CGOpenMPRuntime::getTgtBinaryDescriptorQTy() {
	// struct __tgt_bin_desc{
	// int32_t NumDevices; // Number of devices supported.
	// __tgt_device_image *DeviceImages; // Arrays of device images
	// // (one per device).
	// __tgt_offload_entry *EntriesBegin; // Begin of the table with all the
	// // entries.
	// __tgt_offload_entry *EntriesEnd; // End of the table with all the
	// // entries (non inclusive).
	// };
	if (TgtBinaryDescriptorQTy.isNull()) {
	ASTContext &C = CGM.getContext();
	auto *RD = C.buildImplicitRecord("__tgt_bin_desc");
	RD->startDefinition();
	// Fields are added in the order shown in the struct sketch above.
	addFieldToRecordDecl(
	C, RD, C.getIntTypeForBitwidth(/DestWidth=/32, /Signed=/true));
	addFieldToRecordDecl(C, RD, C.getPointerType(getTgtDeviceImageQTy()));
	addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy()));
	addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy()));
	RD->completeDefinition();
	TgtBinaryDescriptorQTy = C.getRecordType(RD);
	}
	return TgtBinaryDescriptorQTy;
	}

	namespace {
	/// \brief Bundle of the declarations that describe one privatized variable.
	struct PrivateHelpersTy {
	  PrivateHelpersTy(const VarDecl *Original, const VarDecl *PrivateCopy,
	                   const VarDecl *PrivateElemInit)
	      : Original(Original), PrivateCopy(PrivateCopy),
	        PrivateElemInit(PrivateElemInit) {}
	  /// Original variable; its type and alignment attributes are used for the
	  /// matching field of the privates record.
	  const VarDecl *Original;
	  /// Presumably the private copy of \c Original - confirm against users.
	  const VarDecl *PrivateCopy;
	  /// Presumably the helper variable used to initialize the private copy -
	  /// confirm against users.
	  const VarDecl *PrivateElemInit;
	};
	/// A private variable's helpers paired with an alignment.
	typedef std::pair<CharUnits /*Align*/, PrivateHelpersTy> PrivateDataTy;
	} // anonymous namespace

	/// \brief Build the implicit record that holds one field per private
	/// variable, or return nullptr when \a Privates is empty.
	///
	/// Each field has the non-reference type of the original variable and
	/// receives copies of that variable's AlignedAttr attributes.
	static RecordDecl *
	createPrivatesRecordDecl(CodeGenModule &CGM, ArrayRef<PrivateDataTy> Privates) {
	  // No privates, no record.
	  if (Privates.empty())
	    return nullptr;

	  auto &Ctx = CGM.getContext();
	  // Build struct .kmp_privates_t. {
	  //         /*  private vars  */
	  //       };
	  auto *PrivatesRD = Ctx.buildImplicitRecord(".kmp_privates.t");
	  PrivatesRD->startDefinition();
	  for (auto &&Private : Privates) {
	    auto *OrigVD = Private.second.Original;
	    auto FieldTy = OrigVD->getType().getNonReferenceType();
	    auto *Field = addFieldToRecordDecl(Ctx, PrivatesRD, FieldTy);
	    if (!OrigVD->hasAttrs())
	      continue;
	    // Carry the alignment requirements of the original over to the field.
	    for (specific_attr_iterator<AlignedAttr>
	             AttrIt(OrigVD->getAttrs().begin()),
	         AttrEnd(OrigVD->getAttrs().end());
	         AttrIt != AttrEnd; ++AttrIt)
	      Field->addAttr(*AttrIt);
	  }
	  PrivatesRD->completeDefinition();
	  return PrivatesRD;
	}

	/// \brief Build the implicit kmp_task_t record, together with the
	/// kmp_cmplrdata_t union used for its data1/data2 fields.
	///
	/// \param Kind Directive kind; taskloop directives get four extra fields.
	/// \param KmpInt32Ty Type used for part_id, the union's integer member and
	/// the taskloop last-iteration field.
	/// \param KmpRoutineEntryPointerQTy Type used for the routine field and the
	/// union's pointer member.
	static RecordDecl *
	createKmpTaskTRecordDecl(CodeGenModule &CGM, OpenMPDirectiveKind Kind,
	QualType KmpInt32Ty,
	QualType KmpRoutineEntryPointerQTy) {
	auto &C = CGM.getContext();
	// Build struct kmp_task_t {
	// void * shareds;
	// kmp_routine_entry_t routine;
	// kmp_int32 part_id;
	// kmp_cmplrdata_t data1;
	// kmp_cmplrdata_t data2;
	// For taskloops additional fields:
	// kmp_uint64 lb;
	// kmp_uint64 ub;
	// kmp_int64 st;
	// kmp_int32 liter;
	// };
	// The union has an integer member and a routine-entry pointer member.
	auto *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TTK_Union);
	UD->startDefinition();
	addFieldToRecordDecl(C, UD, KmpInt32Ty);
	addFieldToRecordDecl(C, UD, KmpRoutineEntryPointerQTy);
	UD->completeDefinition();
	QualType KmpCmplrdataTy = C.getRecordType(UD);
	auto *RD = C.buildImplicitRecord("kmp_task_t");
	RD->startDefinition();
	// Field order must stay in sync with the KmpTaskTFields enumerators.
	addFieldToRecordDecl(C, RD, C.VoidPtrTy);
	addFieldToRecordDecl(C, RD, KmpRoutineEntryPointerQTy);
	addFieldToRecordDecl(C, RD, KmpInt32Ty);
	addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
	addFieldToRecordDecl(C, RD, KmpCmplrdataTy);
	if (isOpenMPTaskLoopDirective(Kind)) {
	QualType KmpUInt64Ty =
	CGM.getContext().getIntTypeForBitwidth(/DestWidth=/64, /Signed=/0);
	QualType KmpInt64Ty =
	CGM.getContext().getIntTypeForBitwidth(/DestWidth=/64, /Signed=/1);
	// lb, ub (unsigned 64-bit), st (signed 64-bit), liter (kmp_int32).
	addFieldToRecordDecl(C, RD, KmpUInt64Ty);
	addFieldToRecordDecl(C, RD, KmpUInt64Ty);
	addFieldToRecordDecl(C, RD, KmpInt64Ty);
	addFieldToRecordDecl(C, RD, KmpInt32Ty);
	}
	RD->completeDefinition();
	return RD;
	}

	/// \brief Build the implicit record that wraps kmp_task_t together with the
	/// optional privates record.
	///
	/// The privates field is only present when \a Privates is not empty.
	static RecordDecl *
	createKmpTaskTWithPrivatesRecordDecl(CodeGenModule &CGM, QualType KmpTaskTQTy,
	                                     ArrayRef<PrivateDataTy> Privates) {
	  auto &Ctx = CGM.getContext();
	  // Build struct kmp_task_t_with_privates {
	  //         kmp_task_t task_data;
	  //         .kmp_privates_t. privates;
	  //       };
	  auto *WrapperRD = Ctx.buildImplicitRecord("kmp_task_t_with_privates");
	  WrapperRD->startDefinition();
	  addFieldToRecordDecl(Ctx, WrapperRD, KmpTaskTQTy);
	  RecordDecl *PrivatesRD = createPrivatesRecordDecl(CGM, Privates);
	  if (PrivatesRD != nullptr)
	    addFieldToRecordDecl(Ctx, WrapperRD, Ctx.getRecordType(PrivatesRD));
	  WrapperRD->completeDefinition();
	  return WrapperRD;
	}

	/// \brief Emit a proxy function which accepts kmp_task_t as the second
	/// argument.
	/// \code
	/// kmp_int32 .omp_task_entry.(kmp_int32 gtid, kmp_task_t *tt) {
	///   TaskFunction(gtid, &tt->task_data.part_id, &tt->privates,
	///                task_privates_map, tt,
	///                // For taskloops only:
	///                tt->task_data.lb, tt->task_data.ub, tt->task_data.st,
	///                tt->task_data.liter,
	///                tt->task_data.shareds);
	///   return 0;
	/// }
	/// \endcode
	/// \param Kind Directive kind; taskloop directives get the extra
	/// lb/ub/st/liter arguments.
	/// \param SharedsPtrTy Pointer type the loaded shareds pointer is cast to.
	/// \param TaskFunction Outlined task body called by the proxy.
	/// \param TaskPrivatesMap Value forwarded as the fourth call argument.
	/// \return The generated internal-linkage ".omp_task_entry." function.
	static llvm::Value *
	emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc,
	OpenMPDirectiveKind Kind, QualType KmpInt32Ty,
	QualType KmpTaskTWithPrivatesPtrQTy,
	QualType KmpTaskTWithPrivatesQTy, QualType KmpTaskTQTy,
	QualType SharedsPtrTy, llvm::Value *TaskFunction,
	llvm::Value *TaskPrivatesMap) {
	auto &C = CGM.getContext();
	// Proxy signature: (kmp_int32 gtid, kmp_task_t_with_privates *restrict tt).
	FunctionArgList Args;
	ImplicitParamDecl GtidArg(C, /DC=/nullptr, Loc, /Id=/nullptr, KmpInt32Ty);
	ImplicitParamDecl TaskTypeArg(C, /DC=/nullptr, Loc,
	/Id=/nullptr,
	KmpTaskTWithPrivatesPtrQTy.withRestrict());
	Args.push_back(&GtidArg);
	Args.push_back(&TaskTypeArg);
	auto &TaskEntryFnInfo =
	CGM.getTypes().arrangeBuiltinFunctionDeclaration(KmpInt32Ty, Args);
	auto *TaskEntryTy = CGM.getTypes().GetFunctionType(TaskEntryFnInfo);
	auto *TaskEntry =
	llvm::Function::Create(TaskEntryTy, llvm::GlobalValue::InternalLinkage,
	".omp_task_entry.", &CGM.getModule());
	CGM.SetInternalFunctionAttributes(/D=/nullptr, TaskEntry, TaskEntryFnInfo);
	// The proxy body is emitted with its own CodeGenFunction, debug info off.
	CodeGenFunction CGF(CGM);
	CGF.disableDebugInfo();
	CGF.StartFunction(GlobalDecl(), KmpInt32Ty, TaskEntry, TaskEntryFnInfo, Args);

	// TaskFunction(gtid, tt->task_data.part_id, &tt->privates, task_privates_map,
	// tt,
	// For taskloops:
	// tt->task_data.lb, tt->task_data.ub, tt->task_data.st, tt->task_data.liter,
	// tt->task_data.shareds);
	auto *GtidParam = CGF.EmitLoadOfScalar(
	CGF.GetAddrOfLocalVar(&GtidArg), /Volatile=/false, KmpInt32Ty, Loc);
	// TDBase is *tt; Base is its first field (the embedded kmp_task_t).
	LValue TDBase = CGF.EmitLoadOfPointerLValue(
	CGF.GetAddrOfLocalVar(&TaskTypeArg),
	KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
	auto *KmpTaskTWithPrivatesQTyRD =
	cast<RecordDecl>(KmpTaskTWithPrivatesQTy->getAsTagDecl());
	LValue Base =
	CGF.EmitLValueForField(TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin());
	auto *KmpTaskTQTyRD = cast<RecordDecl>(KmpTaskTQTy->getAsTagDecl());
	// part_id is passed by address, not by value.
	auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId);
	auto PartIdLVal = CGF.EmitLValueForField(Base, *PartIdFI);
	auto *PartidParam = PartIdLVal.getPointer();

	// Load the shareds pointer and cast it to the expected pointer type.
	auto SharedsFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTShareds);
	auto SharedsLVal = CGF.EmitLValueForField(Base, *SharedsFI);
	auto *SharedsParam = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	CGF.EmitLoadOfLValue(SharedsLVal, Loc).getScalarVal(),
	CGF.ConvertTypeForMem(SharedsPtrTy));

	// The privates record is the optional second field of the wrapper record;
	// a null void pointer is passed when that field does not exist.
	auto PrivatesFI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin(), 1);
	llvm::Value *PrivatesParam;
	if (PrivatesFI != KmpTaskTWithPrivatesQTyRD->field_end()) {
	auto PrivatesLVal = CGF.EmitLValueForField(TDBase, *PrivatesFI);
	PrivatesParam = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	PrivatesLVal.getPointer(), CGF.VoidPtrTy);
	} else
	PrivatesParam = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);

	// Arguments common to every directive kind; the last one is tt as void *.
	llvm::Value *CommonArgs[] = {GtidParam, PartidParam, PrivatesParam,
	TaskPrivatesMap,
	CGF.Builder
	.CreatePointerBitCastOrAddrSpaceCast(
	TDBase.getAddress(), CGF.VoidPtrTy)
	.getPointer()};
	SmallVector<llvm::Value *, 16> CallArgs(std::begin(CommonArgs),
	std::end(CommonArgs));
	// Taskloops additionally pass lb, ub, st and liter by value.
	if (isOpenMPTaskLoopDirective(Kind)) {
	auto LBFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound);
	auto LBLVal = CGF.EmitLValueForField(Base, *LBFI);
	auto *LBParam = CGF.EmitLoadOfLValue(LBLVal, Loc).getScalarVal();
	auto UBFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound);
	auto UBLVal = CGF.EmitLValueForField(Base, *UBFI);
	auto *UBParam = CGF.EmitLoadOfLValue(UBLVal, Loc).getScalarVal();
	auto StFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTStride);
	auto StLVal = CGF.EmitLValueForField(Base, *StFI);
	auto *StParam = CGF.EmitLoadOfLValue(StLVal, Loc).getScalarVal();
	auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter);
	auto LILVal = CGF.EmitLValueForField(Base, *LIFI);
	auto *LIParam = CGF.EmitLoadOfLValue(LILVal, Loc).getScalarVal();
	CallArgs.push_back(LBParam);
	CallArgs.push_back(UBParam);
	CallArgs.push_back(StParam);
	CallArgs.push_back(LIParam);
	}
	// The shareds pointer is always the final argument.
	CallArgs.push_back(SharedsParam);

	CGF.EmitCallOrInvoke(TaskFunction, CallArgs);
	// Store constant 0 into the proxy's return slot.
	CGF.EmitStoreThroughLValue(
	RValue::get(CGF.Builder.getInt32(/C=/0)),
	CGF.MakeAddrLValue(CGF.ReturnValue, KmpInt32Ty));
	CGF.FinishFunction();
	return TaskEntry;
	}

	/// Emit the ".omp_task_destructor." helper for a task's private copies.
	///
	/// The generated function has the signature
	/// \code
	/// kmp_int32 .omp_task_destructor.(kmp_int32 gtid,
	///                                 kmp_task_t_with_privates *restrict tt);
	/// \endcode
	/// and pushes a destroy cleanup for every field of the privates record (the
	/// second field of \p KmpTaskTWithPrivatesQTy) whose type reports a
	/// destruction kind.
	///
	/// The second field is dereferenced unconditionally, so the record passed in
	/// must contain a privates field.
	/// \return The generated internal-linkage function.
	static llvm::Value *emitDestructorsFunction(CodeGenModule &CGM,
	                                            SourceLocation Loc,
	                                            QualType KmpInt32Ty,
	                                            QualType KmpTaskTWithPrivatesPtrQTy,
	                                            QualType KmpTaskTWithPrivatesQTy) {
	  auto &C = CGM.getContext();
	  FunctionArgList Args;
	  ImplicitParamDecl GtidArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpInt32Ty);
	  ImplicitParamDecl TaskTypeArg(C, /*DC=*/nullptr, Loc,
	                                /*Id=*/nullptr,
	                                KmpTaskTWithPrivatesPtrQTy.withRestrict());
	  Args.push_back(&GtidArg);
	  Args.push_back(&TaskTypeArg);
	  auto &DestructorFnInfo =
	      CGM.getTypes().arrangeBuiltinFunctionDeclaration(KmpInt32Ty, Args);
	  auto *DestructorFnTy = CGM.getTypes().GetFunctionType(DestructorFnInfo);
	  auto *DestructorFn =
	      llvm::Function::Create(DestructorFnTy, llvm::GlobalValue::InternalLinkage,
	                             ".omp_task_destructor.", &CGM.getModule());
	  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, DestructorFn,
	                                    DestructorFnInfo);
	  // The helper body is emitted with its own CodeGenFunction, debug info off.
	  CodeGenFunction CGF(CGM);
	  CGF.disableDebugInfo();
	  CGF.StartFunction(GlobalDecl(), KmpInt32Ty, DestructorFn, DestructorFnInfo,
	                    Args);

	  // Base starts as *tt and is then narrowed to tt->privates.
	  LValue Base = CGF.EmitLoadOfPointerLValue(
	      CGF.GetAddrOfLocalVar(&TaskTypeArg),
	      KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
	  auto *KmpTaskTWithPrivatesQTyRD =
	      cast<RecordDecl>(KmpTaskTWithPrivatesQTy->getAsTagDecl());
	  auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
	  Base = CGF.EmitLValueForField(Base, *FI);
	  // Register a destroy cleanup for each private copy that needs one.
	  for (auto *Field :
	       cast<RecordDecl>(FI->getType()->getAsTagDecl())->fields()) {
	    if (auto DtorKind = Field->getType().isDestructedType()) {
	      auto FieldLValue = CGF.EmitLValueForField(Base, Field);
	      CGF.pushDestroy(DtorKind, FieldLValue.getAddress(), Field->getType());
	    }
	  }
	  CGF.FinishFunction();
	  return DestructorFn;
	}

	/// \brief Emit a privates mapping function for correct handling of private,
	/// firstprivate and lastprivate variables.
	/// \code
	/// void .omp_task_privates_map.(const .privates. *noalias privs, <ty1>
	/// **noalias priv1,..., <tyn> **noalias privn) {
	///   *priv1 = &.privates.priv1;
	///   ...;
	///   *privn = &.privates.privn;
	/// }
	/// \endcode
	/// The out-parameters are declared in the order PrivateVars, FirstprivateVars,
	/// LastprivateVars. The fields of \p PrivatesQTy are visited in declaration
	/// order and matched to their out-parameter through the original variable
	/// recorded in \p Privates at the same index.
	/// \return The generated internal-linkage, always-inline function.
	static llvm::Value *
	emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc,
	                               ArrayRef<const Expr *> PrivateVars,
	                               ArrayRef<const Expr *> FirstprivateVars,
	                               ArrayRef<const Expr *> LastprivateVars,
	                               QualType PrivatesQTy,
	                               ArrayRef<PrivateDataTy> Privates) {
	  auto &C = CGM.getContext();
	  FunctionArgList Args;
	  ImplicitParamDecl TaskPrivatesArg(
	      C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
	      C.getPointerType(PrivatesQTy).withConst().withRestrict());
	  Args.push_back(&TaskPrivatesArg);
	  // Maps each original variable to the index of its out-parameter in Args.
	  // Index 0 is the privates pointer, so numbering starts at 1.
	  llvm::DenseMap<const VarDecl *, unsigned> PrivateVarsPos;
	  unsigned Counter = 1;
	  // Appends one "<ty> **const restrict" parameter per variable in Vars and
	  // records its position.
	  auto AppendOutParams = [&](ArrayRef<const Expr *> Vars) {
	    for (auto *E : Vars) {
	      Args.push_back(ImplicitParamDecl::Create(
	          C, /*DC=*/nullptr, Loc,
	          /*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType()))
	                              .withConst()
	                              .withRestrict()));
	      auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
	      PrivateVarsPos[VD] = Counter;
	      ++Counter;
	    }
	  };
	  AppendOutParams(PrivateVars);
	  AppendOutParams(FirstprivateVars);
	  AppendOutParams(LastprivateVars);
	  auto &TaskPrivatesMapFnInfo =
	      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	  auto *TaskPrivatesMapTy =
	      CGM.getTypes().GetFunctionType(TaskPrivatesMapFnInfo);
	  auto *TaskPrivatesMap = llvm::Function::Create(
	      TaskPrivatesMapTy, llvm::GlobalValue::InternalLinkage,
	      ".omp_task_privates_map.", &CGM.getModule());
	  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskPrivatesMap,
	                                    TaskPrivatesMapFnInfo);
	  // Replace the noinline attribute with alwaysinline on the mapping function.
	  TaskPrivatesMap->removeFnAttr(llvm::Attribute::NoInline);
	  TaskPrivatesMap->addFnAttr(llvm::Attribute::AlwaysInline);
	  CodeGenFunction CGF(CGM);
	  CGF.disableDebugInfo();
	  CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskPrivatesMap,
	                    TaskPrivatesMapFnInfo, Args);

	  // *privi = &.privates.privi;
	  LValue Base = CGF.EmitLoadOfPointerLValue(
	      CGF.GetAddrOfLocalVar(&TaskPrivatesArg),
	      TaskPrivatesArg.getType()->castAs<PointerType>());
	  auto *PrivatesQTyRD = cast<RecordDecl>(PrivatesQTy->getAsTagDecl());
	  // Counter is reused as the index into Privates, which parallels the fields.
	  Counter = 0;
	  for (auto *Field : PrivatesQTyRD->fields()) {
	    auto FieldLVal = CGF.EmitLValueForField(Base, Field);
	    auto *VD = Args[PrivateVarsPos[Privates[Counter].second.Original]];
	    auto RefLVal = CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
	    auto RefLoadLVal = CGF.EmitLoadOfPointerLValue(
	        RefLVal.getAddress(), RefLVal.getType()->castAs<PointerType>());
	    CGF.EmitStoreOfScalar(FieldLVal.getPointer(), RefLoadLVal);
	    ++Counter;
	  }
	  CGF.FinishFunction();
	  return TaskPrivatesMap;
	}

	/// Three-way comparator on the \c first member of two PrivateDataTy entries.
	/// \return 1 when \p P1 has the smaller key, -1 when \p P2 has the smaller
	/// key, and 0 when neither compares less than the other.
	static int array_pod_sort_comparator(const PrivateDataTy *P1,
	                                     const PrivateDataTy *P2) {
	  if (P1->first < P2->first)
	    return 1;
	  if (P2->first < P1->first)
	    return -1;
	  return 0;
	}

	/// Emit initialization for private variables in task-based directives.
	///
	/// Walks \p Privates in lockstep with the fields of the privates record (the
	/// second field of \p KmpTaskTWithPrivatesQTyRD) and emits the initializer of
	/// each private copy into the matching field reached through \p TDBase.
	/// \param KmpTaskSharedsPtr Address of the shareds block; it is only used as
	/// the copy source when \p Data has firstprivate variables.
	/// \param TDBase LValue of the kmp_task_t_with_privates object being filled.
	/// \param ForDup When true, only initializers that are non-trivial
	/// CXXConstructExprs are emitted; when false, every private copy that has an
	/// initializer is initialized.
	static void emitPrivatesInit(CodeGenFunction &CGF,
	const OMPExecutableDirective &D,
	Address KmpTaskSharedsPtr, LValue TDBase,
	const RecordDecl *KmpTaskTWithPrivatesQTyRD,
	QualType SharedsTy, QualType SharedsPtrTy,
	const OMPTaskDataTy &Data,
	ArrayRef<PrivateDataTy> Privates, bool ForDup) {
	auto &C = CGF.getContext();
	// FI first points at the privates field of the wrapper record.
	auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
	LValue PrivatesBase = CGF.EmitLValueForField(TDBase, *FI);
	// SrcBase (the shareds block viewed as SharedsTy) is only set up when there
	// are firstprivates to copy from it.
	LValue SrcBase;
	if (!Data.FirstprivateVars.empty()) {
	SrcBase = CGF.MakeAddrLValue(
	CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	KmpTaskSharedsPtr, CGF.ConvertTypeForMem(SharedsPtrTy)),
	SharedsTy);
	}
	CodeGenFunction::CGCapturedStmtInfo CapturesInfo(
	cast<CapturedStmt>(*D.getAssociatedStmt()));
	// From here on FI iterates over the fields of the privates record itself.
	FI = cast<RecordDecl>(FI->getType()->getAsTagDecl())->field_begin();
	for (auto &&Pair : Privates) {
	auto *VD = Pair.second.PrivateCopy;
	auto *Init = VD->getAnyInitializer();
	if (Init && (!ForDup \|\| (isa<CXXConstructExpr>(Init) &&
	!CGF.isTrivialInitializer(Init)))) {
	LValue PrivateLValue = CGF.EmitLValueForField(PrivatesBase, *FI);
	// A non-null PrivateElemInit means the copy is initialized from the
	// original variable captured in the shareds block.
	if (auto *Elem = Pair.second.PrivateElemInit) {
	auto *OriginalVD = Pair.second.Original;
	auto *SharedField = CapturesInfo.lookup(OriginalVD);
	auto SharedRefLValue = CGF.EmitLValueForField(SrcBase, SharedField);
	// Rebuild the lvalue using the declared alignment of the original variable.
	SharedRefLValue = CGF.MakeAddrLValue(
	Address(SharedRefLValue.getPointer(), C.getDeclAlign(OriginalVD)),
	SharedRefLValue.getType(), AlignmentSource::Decl);
	QualType Type = OriginalVD->getType();
	if (Type->isArrayType()) {
	// Initialize firstprivate array.
	if (!isa<CXXConstructExpr>(Init) \|\| CGF.isTrivialInitializer(Init)) {
	// Perform simple memcpy.
	CGF.EmitAggregateAssign(PrivateLValue.getAddress(),
	SharedRefLValue.getAddress(), Type);
	} else {
	// Initialize firstprivate array using element-by-element
	// initialization.
	CGF.EmitOMPAggregateAssign(
	PrivateLValue.getAddress(), SharedRefLValue.getAddress(), Type,
	[&CGF, Elem, Init, &CapturesInfo](Address DestElement,
	Address SrcElement) {
	// Clean up any temporaries needed by the initialization.
	CodeGenFunction::OMPPrivateScope InitScope(CGF);
	InitScope.addPrivate(
	Elem, [SrcElement]() -> Address { return SrcElement; });
	(void)InitScope.Privatize();
	// Emit initialization for single element.
	CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(
	CGF, &CapturesInfo);
	CGF.EmitAnyExprToMem(Init, DestElement,
	Init->getType().getQualifiers(),
	/IsInitializer=/false);
	});
	}
	} else {
	// Non-array case: map Elem to the shared original and emit the
	// initializer into the private field.
	CodeGenFunction::OMPPrivateScope InitScope(CGF);
	InitScope.addPrivate(Elem, [SharedRefLValue]() -> Address {
	return SharedRefLValue.getAddress();
	});
	(void)InitScope.Privatize();
	CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CapturesInfo);
	CGF.EmitExprAsInit(Init, VD, PrivateLValue,
	/capturedByInit=/false);
	}
	} else
	CGF.EmitExprAsInit(Init, VD, PrivateLValue, /capturedByInit=/false);
	}
	// Advance to the next field even when nothing was emitted for this one.
	++FI;
	}
	}

	/// Check if duplication function is required for taskloops.
	/// \return true when at least one private copy in \p Privates has an
	/// initializer that is a CXXConstructExpr and is not trivial.
	static bool checkInitIsRequired(CodeGenFunction &CGF,
	                                ArrayRef<PrivateDataTy> Privates) {
	  for (auto &&Entry : Privates) {
	    auto *PrivateCopy = Entry.second.PrivateCopy;
	    auto *Initializer = PrivateCopy->getAnyInitializer();
	    if (!Initializer)
	      continue;
	    if (isa<CXXConstructExpr>(Initializer) &&
	        !CGF.isTrivialInitializer(Initializer))
	      return true;
	  }
	  return false;
	}


	/// Emit task_dup function (for initialization of
	/// private/firstprivate/lastprivate vars and last_iter flag)
	/// \code
	/// void .omp_task_dup.(kmp_task_t_with_privates *task_dst,
	///                     kmp_task_t_with_privates *task_src, int lastpriv) {
	///   // setup lastprivate flag (only when WithLastIter is set)
	///   task_dst->task_data.liter = lastpriv;
	///   // could be constructor calls here...
	/// }
	/// \endcode
	/// \param Privates Private copies to initialize; must not be empty.
	/// \param WithLastIter When true, the liter field of the destination task is
	/// set from the lastpriv argument.
	/// \return The generated internal-linkage ".omp_task_dup." function.
	static llvm::Value *
	emitTaskDupFunction(CodeGenModule &CGM, SourceLocation Loc,
	const OMPExecutableDirective &D,
	QualType KmpTaskTWithPrivatesPtrQTy,
	const RecordDecl *KmpTaskTWithPrivatesQTyRD,
	const RecordDecl *KmpTaskTQTyRD, QualType SharedsTy,
	QualType SharedsPtrTy, const OMPTaskDataTy &Data,
	ArrayRef<PrivateDataTy> Privates, bool WithLastIter) {
	auto &C = CGM.getContext();
	// Signature: (dst task pointer, src task pointer, int lastpriv).
	FunctionArgList Args;
	ImplicitParamDecl DstArg(C, /DC=/nullptr, Loc,
	/Id=/nullptr, KmpTaskTWithPrivatesPtrQTy);
	ImplicitParamDecl SrcArg(C, /DC=/nullptr, Loc,
	/Id=/nullptr, KmpTaskTWithPrivatesPtrQTy);
	ImplicitParamDecl LastprivArg(C, /DC=/nullptr, Loc,
	/Id=/nullptr, C.IntTy);
	Args.push_back(&DstArg);
	Args.push_back(&SrcArg);
	Args.push_back(&LastprivArg);
	auto &TaskDupFnInfo =
	CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	auto *TaskDupTy = CGM.getTypes().GetFunctionType(TaskDupFnInfo);
	auto *TaskDup =
	llvm::Function::Create(TaskDupTy, llvm::GlobalValue::InternalLinkage,
	".omp_task_dup.", &CGM.getModule());
	CGM.SetInternalFunctionAttributes(/D=/nullptr, TaskDup, TaskDupFnInfo);
	CodeGenFunction CGF(CGM);
	CGF.disableDebugInfo();
	CGF.StartFunction(GlobalDecl(), C.VoidTy, TaskDup, TaskDupFnInfo, Args);

	// TDBase is the destination task (*task_dst).
	LValue TDBase = CGF.EmitLoadOfPointerLValue(
	CGF.GetAddrOfLocalVar(&DstArg),
	KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
	// task_dst->liter = lastpriv;
	if (WithLastIter) {
	auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter);
	LValue Base = CGF.EmitLValueForField(
	TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin());
	LValue LILVal = CGF.EmitLValueForField(Base, *LIFI);
	llvm::Value *Lastpriv = CGF.EmitLoadOfScalar(
	CGF.GetAddrOfLocalVar(&LastprivArg), /Volatile=/false, C.IntTy, Loc);
	CGF.EmitStoreOfScalar(Lastpriv, LILVal);
	}

	// Emit initial values for private copies (if any).
	assert(!Privates.empty());
	Address KmpTaskSharedsPtr = Address::invalid();
	if (!Data.FirstprivateVars.empty()) {
	// NOTE: this inner TDBase shadows the destination one above and refers to
	// the source task; the shareds pointer is loaded from task_src.
	LValue TDBase = CGF.EmitLoadOfPointerLValue(
	CGF.GetAddrOfLocalVar(&SrcArg),
	KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
	LValue Base = CGF.EmitLValueForField(
	TDBase, *KmpTaskTWithPrivatesQTyRD->field_begin());
	KmpTaskSharedsPtr = Address(
	CGF.EmitLoadOfScalar(CGF.EmitLValueForField(
	Base, *std::next(KmpTaskTQTyRD->field_begin(),
	KmpTaskTShareds)),
	Loc),
	CGF.getNaturalTypeAlignment(SharedsTy));
	}
	// Initialize the destination task's privates (outer TDBase) in ForDup mode.
	emitPrivatesInit(CGF, D, KmpTaskSharedsPtr, TDBase, KmpTaskTWithPrivatesQTyRD,
	SharedsTy, SharedsPtrTy, Data, Privates, /ForDup=/true);
	CGF.FinishFunction();
	return TaskDup;
	}

	/// Checks if destructor function is required to be generated.
	/// Inspects the fields of the privates record, i.e. the second field of
	/// \p KmpTaskTWithPrivatesQTyRD, which is dereferenced unconditionally.
	/// \return true if cleanups are required, false otherwise.
	static bool
	checkDestructorsRequired(const RecordDecl *KmpTaskTWithPrivatesQTyRD) {
	  auto PrivatesFI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
	  auto *PrivatesRD = cast<RecordDecl>(PrivatesFI->getType()->getAsTagDecl());
	  // Stop at the first private copy whose type needs destruction.
	  for (auto *Field : PrivatesRD->fields()) {
	    if (Field->getType().isDestructedType())
	      return true;
	  }
	  return false;
	}

	/// Emit allocation and initialization of a task object for directive \p D.
	///
	/// Collects the private/firstprivate/lastprivate copies from \p Data, builds
	/// the kmp_task_t_with_privates record, emits the privates-mapping and proxy
	/// entry functions, calls __kmpc_omp_task_alloc, copies the shareds, and
	/// initializes the privates, the destructors thunk and the priority as needed.
	/// \param TaskFunction Outlined task body; it is cast to llvm::Function and
	/// the type of its fourth argument is used for the privates-map pointer.
	/// \param SharedsTy Record type of the captured shareds.
	/// \param Shareds Address of the shareds to copy into the new task.
	/// \return The new task pointer (raw and cast), the proxy entry function,
	/// the kmp_task_t lvalue and record, and the task-dup function if one was
	/// emitted.
	CGOpenMPRuntime::TaskResultTy
	CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
	const OMPExecutableDirective &D,
	llvm::Value *TaskFunction, QualType SharedsTy,
	Address Shareds, const OMPTaskDataTy &Data) {
	auto &C = CGM.getContext();
	llvm::SmallVector<PrivateDataTy, 4> Privates;
	// Aggregate privates and sort them by the alignment.
	// Each entry is (declared alignment of the original variable, helpers).
	auto I = Data.PrivateCopies.begin();
	for (auto *E : Data.PrivateVars) {
	auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
	Privates.push_back(std::make_pair(
	C.getDeclAlign(VD),
	PrivateHelpersTy(VD, cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()),
	/PrivateElemInit=/nullptr)));
	++I;
	}
	// Firstprivates additionally carry the element-init helper variable.
	I = Data.FirstprivateCopies.begin();
	auto IElemInitRef = Data.FirstprivateInits.begin();
	for (auto *E : Data.FirstprivateVars) {
	auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
	Privates.push_back(std::make_pair(
	C.getDeclAlign(VD),
	PrivateHelpersTy(
	VD, cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()),
	cast<VarDecl>(cast<DeclRefExpr>(*IElemInitRef)->getDecl()))));
	++I;
	++IElemInitRef;
	}
	I = Data.LastprivateCopies.begin();
	for (auto *E : Data.LastprivateVars) {
	auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
	Privates.push_back(std::make_pair(
	C.getDeclAlign(VD),
	PrivateHelpersTy(VD, cast<VarDecl>(cast<DeclRefExpr>(*I)->getDecl()),
	/PrivateElemInit=/nullptr)));
	++I;
	}
	llvm::array_pod_sort(Privates.begin(), Privates.end(),
	array_pod_sort_comparator);
	auto KmpInt32Ty = C.getIntTypeForBitwidth(/DestWidth=/32, /Signed=/1);
	// Build type kmp_routine_entry_t (if not built yet).
	emitKmpRoutineEntryT(KmpInt32Ty);
	// Build type kmp_task_t (if not built yet).
	// NOTE(review): the cached type is built from the kind of the first
	// directive that reaches this point, and the taskloop-only fields are added
	// only for taskloop kinds. Confirm that a taskloop emitted after a plain
	// task still sees the lb/ub/st/liter fields.
	if (KmpTaskTQTy.isNull()) {
	KmpTaskTQTy = C.getRecordType(createKmpTaskTRecordDecl(
	CGM, D.getDirectiveKind(), KmpInt32Ty, KmpRoutineEntryPtrQTy));
	}
	auto *KmpTaskTQTyRD = cast<RecordDecl>(KmpTaskTQTy->getAsTagDecl());
	// Build particular struct kmp_task_t for the given task.
	auto *KmpTaskTWithPrivatesQTyRD =
	createKmpTaskTWithPrivatesRecordDecl(CGM, KmpTaskTQTy, Privates);
	auto KmpTaskTWithPrivatesQTy = C.getRecordType(KmpTaskTWithPrivatesQTyRD);
	QualType KmpTaskTWithPrivatesPtrQTy =
	C.getPointerType(KmpTaskTWithPrivatesQTy);
	auto *KmpTaskTWithPrivatesTy = CGF.ConvertType(KmpTaskTWithPrivatesQTy);
	auto *KmpTaskTWithPrivatesPtrTy = KmpTaskTWithPrivatesTy->getPointerTo();
	auto *KmpTaskTWithPrivatesTySize = CGF.getTypeSize(KmpTaskTWithPrivatesQTy);
	QualType SharedsPtrTy = C.getPointerType(SharedsTy);

	// Emit initial values for private copies (if any).
	llvm::Value *TaskPrivatesMap = nullptr;
	// The privates-map pointer type is taken from the fourth parameter of the
	// outlined task function.
	auto *TaskPrivatesMapTy =
	std::next(cast<llvm::Function>(TaskFunction)->getArgumentList().begin(),
	3)
	->getType();
	if (!Privates.empty()) {
	auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
	TaskPrivatesMap = emitTaskPrivateMappingFunction(
	CGM, Loc, Data.PrivateVars, Data.FirstprivateVars, Data.LastprivateVars,
	FI->getType(), Privates);
	TaskPrivatesMap = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	TaskPrivatesMap, TaskPrivatesMapTy);
	} else {
	// No privates: pass a null mapping function.
	TaskPrivatesMap = llvm::ConstantPointerNull::get(
	cast<llvm::PointerType>(TaskPrivatesMapTy));
	}
	// Build a proxy function kmp_int32 .omp_task_entry.(kmp_int32 gtid,
	// kmp_task_t *tt);
	auto *TaskEntry = emitProxyTaskFunction(
	CGM, Loc, D.getDirectiveKind(), KmpInt32Ty, KmpTaskTWithPrivatesPtrQTy,
	KmpTaskTWithPrivatesQTy, KmpTaskTQTy, SharedsPtrTy, TaskFunction,
	TaskPrivatesMap);

	// Build call kmp_task_t * __kmpc_omp_task_alloc(ident_t *, kmp_int32 gtid,
	// kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds,
	// kmp_routine_entry_t *task_entry);
	// Task flags. Format is taken from
	// http://llvm.org/svn/llvm-project/openmp/trunk/runtime/src/kmp.h,
	// description of kmp_tasking_flags struct.
	enum {
	TiedFlag = 0x1,
	FinalFlag = 0x2,
	DestructorsFlag = 0x8,
	PriorityFlag = 0x20
	};
	// Flags known at compile time are collected in Flags.
	unsigned Flags = Data.Tied ? TiedFlag : 0;
	bool NeedsCleanup = false;
	if (!Privates.empty()) {
	NeedsCleanup = checkDestructorsRequired(KmpTaskTWithPrivatesQTyRD);
	if (NeedsCleanup)
	Flags = Flags \| DestructorsFlag;
	}
	if (Data.Priority.getInt())
	Flags = Flags \| PriorityFlag;
	// The final flag is either selected at run time from the Final value or
	// folded from the constant stored alongside it.
	auto *TaskFlags =
	Data.Final.getPointer()
	? CGF.Builder.CreateSelect(Data.Final.getPointer(),
	CGF.Builder.getInt32(FinalFlag),
	CGF.Builder.getInt32(/C=/0))
	: CGF.Builder.getInt32(Data.Final.getInt() ? FinalFlag : 0);
	TaskFlags = CGF.Builder.CreateOr(TaskFlags, CGF.Builder.getInt32(Flags));
	auto *SharedsSize = CGM.getSize(C.getTypeSizeInChars(SharedsTy));
	llvm::Value *AllocArgs[] = {emitUpdateLocation(CGF, Loc),
	getThreadID(CGF, Loc), TaskFlags,
	KmpTaskTWithPrivatesTySize, SharedsSize,
	CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	TaskEntry, KmpRoutineEntryPtrTy)};
	auto *NewTask = CGF.EmitRuntimeCall(
	createRuntimeFunction(OMPRTL__kmpc_omp_task_alloc), AllocArgs);
	// View the returned task as kmp_task_t_with_privates; TDBase is its
	// embedded kmp_task_t (first field).
	auto *NewTaskNewTaskTTy = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	NewTask, KmpTaskTWithPrivatesPtrTy);
	LValue Base = CGF.MakeNaturalAlignAddrLValue(NewTaskNewTaskTTy,
	KmpTaskTWithPrivatesQTy);
	LValue TDBase =
	CGF.EmitLValueForField(Base, *KmpTaskTWithPrivatesQTyRD->field_begin());
	// Fill the data in the resulting kmp_task_t record.
	// Copy shareds if there are any.
	Address KmpTaskSharedsPtr = Address::invalid();
	if (!SharedsTy->getAsStructureType()->getDecl()->field_empty()) {
	KmpTaskSharedsPtr =
	Address(CGF.EmitLoadOfScalar(
	CGF.EmitLValueForField(
	TDBase, *std::next(KmpTaskTQTyRD->field_begin(),
	KmpTaskTShareds)),
	Loc),
	CGF.getNaturalTypeAlignment(SharedsTy));
	CGF.EmitAggregateCopy(KmpTaskSharedsPtr, Shareds, SharedsTy);
	}
	// Emit initial values for private copies (if any).
	TaskResultTy Result;
	if (!Privates.empty()) {
	emitPrivatesInit(CGF, D, KmpTaskSharedsPtr, Base, KmpTaskTWithPrivatesQTyRD,
	SharedsTy, SharedsPtrTy, Data, Privates,
	/ForDup=/false);
	// A task-dup function is only emitted for taskloops that have
	// lastprivates or privates needing non-trivial construction.
	if (isOpenMPTaskLoopDirective(D.getDirectiveKind()) &&
	(!Data.LastprivateVars.empty() \|\| checkInitIsRequired(CGF, Privates))) {
	Result.TaskDupFn = emitTaskDupFunction(
	CGM, Loc, D, KmpTaskTWithPrivatesPtrQTy, KmpTaskTWithPrivatesQTyRD,
	KmpTaskTQTyRD, SharedsTy, SharedsPtrTy, Data, Privates,
	/WithLastIter=/!Data.LastprivateVars.empty());
	}
	}
	// Fields of union "kmp_cmplrdata_t" for destructors and priority.
	enum { Priority = 0, Destructors = 1 };
	// Provide pointer to function with destructors for privates.
	auto FI = std::next(KmpTaskTQTyRD->field_begin(), Data1);
	auto KmpCmplrdataUD = (FI)->getType()->getAsUnionType()->getDecl();
	if (NeedsCleanup) {
	llvm::Value *DestructorFn = emitDestructorsFunction(
	CGM, Loc, KmpInt32Ty, KmpTaskTWithPrivatesPtrQTy,
	KmpTaskTWithPrivatesQTy);
	// Store the destructor thunk into the Destructors member of data1.
	LValue Data1LV = CGF.EmitLValueForField(TDBase, *FI);
	LValue DestructorsLV = CGF.EmitLValueForField(
	Data1LV, *std::next(KmpCmplrdataUD->field_begin(), Destructors));
	CGF.EmitStoreOfScalar(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	DestructorFn, KmpRoutineEntryPtrTy),
	DestructorsLV);
	}
	// Set priority.
	if (Data.Priority.getInt()) {
	// The priority value goes into the Priority member of data2.
	LValue Data2LV = CGF.EmitLValueForField(
	TDBase, *std::next(KmpTaskTQTyRD->field_begin(), Data2));
	LValue PriorityLV = CGF.EmitLValueForField(
	Data2LV, *std::next(KmpCmplrdataUD->field_begin(), Priority));
	CGF.EmitStoreOfScalar(Data.Priority.getPointer(), PriorityLV);
	}
	Result.NewTask = NewTask;
	Result.TaskEntry = TaskEntry;
	Result.NewTaskNewTaskTTy = NewTaskNewTaskTTy;
	Result.TDBase = TDBase;
	Result.KmpTaskTQTyRD = KmpTaskTQTyRD;
	return Result;
	}

	/// Emit the runtime calls that launch a task for directive \p D.
	///
	/// Allocates and initializes the task through emitTaskInit, builds the
	/// kmp_depend_info array for \p Data.Dependences (if any), and then either
	/// enqueues the task (__kmpc_omp_task / __kmpc_omp_task_with_deps) or, on
	/// the else branch of \p IfCond, waits for the dependences and runs the proxy
	/// entry directly between __kmpc_omp_task_begin_if0 and
	/// __kmpc_omp_task_complete_if0.
	/// \param IfCond Optional condition; when null only the enqueue path is
	/// emitted.
	void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
	const OMPExecutableDirective &D,
	llvm::Value *TaskFunction,
	QualType SharedsTy, Address Shareds,
	const Expr *IfCond,
	const OMPTaskDataTy &Data) {
	// Nothing to emit without an insertion point.
	if (!CGF.HaveInsertPoint())
	return;

	TaskResultTy Result =
	emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data);
	llvm::Value *NewTask = Result.NewTask;
	llvm::Value *TaskEntry = Result.TaskEntry;
	llvm::Value *NewTaskNewTaskTTy = Result.NewTaskNewTaskTTy;
	LValue TDBase = Result.TDBase;
	RecordDecl *KmpTaskTQTyRD = Result.KmpTaskTQTyRD;
	auto &C = CGM.getContext();
	// Process list of dependences.
	Address DependenciesArray = Address::invalid();
	unsigned NumDependencies = Data.Dependences.size();
	if (NumDependencies) {
	// Dependence kind for RTL.
	enum RTLDependenceKindTy { DepIn = 0x01, DepInOut = 0x3 };
	// Field indices within the kmp_depend_info record built below.
	enum RTLDependInfoFieldsTy { BaseAddr, Len, Flags };
	RecordDecl *KmpDependInfoRD;
	// The flags field is an unsigned integer with the bit width of bool.
	QualType FlagsTy =
	C.getIntTypeForBitwidth(C.getTypeSize(C.BoolTy), /Signed=/false);
	llvm::Type *LLVMFlagsTy = CGF.ConvertTypeForMem(FlagsTy);
	// Build kmp_depend_info { intptr_t base_addr; size_t len; flags } once and
	// reuse the cached type afterwards.
	if (KmpDependInfoTy.isNull()) {
	KmpDependInfoRD = C.buildImplicitRecord("kmp_depend_info");
	KmpDependInfoRD->startDefinition();
	addFieldToRecordDecl(C, KmpDependInfoRD, C.getIntPtrType());
	addFieldToRecordDecl(C, KmpDependInfoRD, C.getSizeType());
	addFieldToRecordDecl(C, KmpDependInfoRD, FlagsTy);
	KmpDependInfoRD->completeDefinition();
	KmpDependInfoTy = C.getRecordType(KmpDependInfoRD);
	} else
	KmpDependInfoRD = cast<RecordDecl>(KmpDependInfoTy->getAsTagDecl());
	CharUnits DependencySize = C.getTypeSizeInChars(KmpDependInfoTy);
	// Define type kmp_depend_info[<Dependences.size()>];
	QualType KmpDependInfoArrayTy = C.getConstantArrayType(
	KmpDependInfoTy, llvm::APInt(/numBits=/64, NumDependencies),
	ArrayType::Normal, /IndexTypeQuals=/0);
	// kmp_depend_info[<Dependences.size()>] deps;
	DependenciesArray =
	CGF.CreateMemTemp(KmpDependInfoArrayTy, ".dep.arr.addr");
	for (unsigned i = 0; i < NumDependencies; ++i) {
	const Expr *E = Data.Dependences[i].second;
	auto Addr = CGF.EmitLValue(E);
	llvm::Value *Size;
	QualType Ty = E->getType();
	// For array sections the length is (address one past the upper element)
	// minus (address of the lower element); otherwise it is the type size.
	if (auto *ASE = dyn_cast<OMPArraySectionExpr>(E->IgnoreParenImpCasts())) {
	LValue UpAddrLVal =
	CGF.EmitOMPArraySectionExpr(ASE, /LowerBound=/false);
	llvm::Value *UpAddr =
	CGF.Builder.CreateConstGEP1_32(UpAddrLVal.getPointer(), /Idx0=/1);
	llvm::Value *LowIntPtr =
	CGF.Builder.CreatePtrToInt(Addr.getPointer(), CGM.SizeTy);
	llvm::Value *UpIntPtr = CGF.Builder.CreatePtrToInt(UpAddr, CGM.SizeTy);
	Size = CGF.Builder.CreateNUWSub(UpIntPtr, LowIntPtr);
	} else
	Size = CGF.getTypeSize(Ty);
	auto Base = CGF.MakeAddrLValue(
	CGF.Builder.CreateConstArrayGEP(DependenciesArray, i, DependencySize),
	KmpDependInfoTy);
	// deps[i].base_addr = &<Dependences[i].second>;
	auto BaseAddrLVal = CGF.EmitLValueForField(
	Base, *std::next(KmpDependInfoRD->field_begin(), BaseAddr));
	CGF.EmitStoreOfScalar(
	CGF.Builder.CreatePtrToInt(Addr.getPointer(), CGF.IntPtrTy),
	BaseAddrLVal);
	// deps[i].len = sizeof(<Dependences[i].second>);
	auto LenLVal = CGF.EmitLValueForField(
	Base, *std::next(KmpDependInfoRD->field_begin(), Len));
	CGF.EmitStoreOfScalar(Size, LenLVal);
	// deps[i].flags = <Dependences[i].first>;
	RTLDependenceKindTy DepKind;
	switch (Data.Dependences[i].first) {
	case OMPC_DEPEND_in:
	DepKind = DepIn;
	break;
	// Out and InOut dependencies must use the same code.
	case OMPC_DEPEND_out:
	case OMPC_DEPEND_inout:
	DepKind = DepInOut;
	break;
	case OMPC_DEPEND_source:
	case OMPC_DEPEND_sink:
	case OMPC_DEPEND_unknown:
	llvm_unreachable("Unknown task dependence type");
	}
	auto FlagsLVal = CGF.EmitLValueForField(
	Base, *std::next(KmpDependInfoRD->field_begin(), Flags));
	CGF.EmitStoreOfScalar(llvm::ConstantInt::get(LLVMFlagsTy, DepKind),
	FlagsLVal);
	}
	// Pass the array to the runtime as a void pointer to its first element.
	DependenciesArray = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	CGF.Builder.CreateStructGEP(DependenciesArray, 0, CharUnits::Zero()),
	CGF.VoidPtrTy);
	}

	// NOTE: routine and part_id fields are initialized by __kmpc_omp_task_alloc()
	// libcall.
	// Build kmp_int32 __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid,
	// kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
	// kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) if dependence
	// list is not empty
	auto *ThreadID = getThreadID(CGF, Loc);
	auto *UpLoc = emitUpdateLocation(CGF, Loc);
	llvm::Value *TaskArgs[] = { UpLoc, ThreadID, NewTask };
	// DepTaskArgs is only filled in (and only read) when there are dependences.
	llvm::Value *DepTaskArgs[7];
	if (NumDependencies) {
	DepTaskArgs[0] = UpLoc;
	DepTaskArgs[1] = ThreadID;
	DepTaskArgs[2] = NewTask;
	DepTaskArgs[3] = CGF.Builder.getInt32(NumDependencies);
	DepTaskArgs[4] = DependenciesArray.getPointer();
	DepTaskArgs[5] = CGF.Builder.getInt32(0);
	DepTaskArgs[6] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
	}
	// Then-branch: enqueue the task with the runtime.
	auto &&ThenCodeGen = [this, Loc, &Data, TDBase, KmpTaskTQTyRD,
	NumDependencies, &TaskArgs,
	&DepTaskArgs](CodeGenFunction &CGF, PrePostActionTy &) {
	// Untied tasks get part_id reset to 0 before being enqueued.
	if (!Data.Tied) {
	auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId);
	auto PartIdLVal = CGF.EmitLValueForField(TDBase, *PartIdFI);
	CGF.EmitStoreOfScalar(CGF.Builder.getInt32(0), PartIdLVal);
	}
	if (NumDependencies) {
	CGF.EmitRuntimeCall(
	createRuntimeFunction(OMPRTL__kmpc_omp_task_with_deps), DepTaskArgs);
	} else {
	CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task),
	TaskArgs);
	}
	// Check if parent region is untied and build return for untied task;
	if (auto *Region =
	dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
	Region->emitUntiedSwitch(CGF);
	};

	// DepWaitTaskArgs is likewise only filled in when there are dependences.
	llvm::Value *DepWaitTaskArgs[6];
	if (NumDependencies) {
	DepWaitTaskArgs[0] = UpLoc;
	DepWaitTaskArgs[1] = ThreadID;
	DepWaitTaskArgs[2] = CGF.Builder.getInt32(NumDependencies);
	DepWaitTaskArgs[3] = DependenciesArray.getPointer();
	DepWaitTaskArgs[4] = CGF.Builder.getInt32(0);
	DepWaitTaskArgs[5] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
	}
	// Else-branch: wait for dependences, then call the proxy entry directly.
	auto &&ElseCodeGen = [&TaskArgs, ThreadID, NewTaskNewTaskTTy, TaskEntry,
	NumDependencies, &DepWaitTaskArgs](CodeGenFunction &CGF,
	PrePostActionTy &) {
	auto &RT = CGF.CGM.getOpenMPRuntime();
	CodeGenFunction::RunCleanupsScope LocalScope(CGF);
	// Build void __kmpc_omp_wait_deps(ident_t *, kmp_int32 gtid,
	// kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
	// ndeps_noalias, kmp_depend_info_t *noalias_dep_list); if dependence info
	// is specified.
	if (NumDependencies)
	CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__kmpc_omp_wait_deps),
	DepWaitTaskArgs);
	// Call proxy_task_entry(gtid, new_task);
	auto &&CodeGen = [TaskEntry, ThreadID, NewTaskNewTaskTTy](
	CodeGenFunction &CGF, PrePostActionTy &Action) {
	Action.Enter(CGF);
	llvm::Value *OutlinedFnArgs[] = {ThreadID, NewTaskNewTaskTTy};
	CGF.EmitCallOrInvoke(TaskEntry, OutlinedFnArgs);
	};

	// Build void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid,
	// kmp_task_t *new_task);
	// Build void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
	// kmp_task_t *new_task);
	RegionCodeGenTy RCG(CodeGen);
	CommonActionTy Action(
	RT.createRuntimeFunction(OMPRTL__kmpc_omp_task_begin_if0), TaskArgs,
	RT.createRuntimeFunction(OMPRTL__kmpc_omp_task_complete_if0), TaskArgs);
	RCG.setAction(Action);
	RCG(CGF);
	};

	// With an if clause both paths are emitted; otherwise only the enqueue path.
	if (IfCond)
	emitOMPIfClause(CGF, IfCond, ThenCodeGen, ElseCodeGen);
	else {
	RegionCodeGenTy ThenRCG(ThenCodeGen);
	ThenRCG(CGF);
	}
	}

	/// Emit the runtime call for a taskloop directive.
	///
	/// Initializes the task with emitTaskInit(), stores the initializers of the
	/// directive's lower-bound, upper-bound and stride variables into the matching
	/// fields of the task descriptor, and then emits a call to __kmpc_taskloop().
	/// Nothing is emitted when \p CGF has no insertion point.
	///
	/// \param IfCond Optional 'if' clause condition. When null, the if_val
	/// argument passed to the runtime is the constant 1.
	/// \param Data Supplies the nogroup flag and the optional schedule expression
	/// (its integer tag selects num_tasks over grainsize).
	void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
	                                       const OMPLoopDirective &D,
	                                       llvm::Value *TaskFunction,
	                                       QualType SharedsTy, Address Shareds,
	                                       const Expr *IfCond,
	                                       const OMPTaskDataTy &Data) {
	  if (!CGF.HaveInsertPoint())
	    return;
	  TaskResultTy Result =
	      emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data);
	  // NOTE: routine and part_id fields are initialized by __kmpc_omp_task_alloc()
	  // libcall.
	  // Call to void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
	  // if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
	  // sched, kmp_uint64 grainsize, void *task_dup);
	  llvm::Value *ThreadID = getThreadID(CGF, Loc);
	  llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
	  llvm::Value *IfVal;
	  if (IfCond) {
	    IfVal = CGF.Builder.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.IntTy,
	                                      /*isSigned=*/true);
	  } else
	    IfVal = llvm::ConstantInt::getSigned(CGF.IntTy, /*V=*/1);

	  // Store the lower bound into the task descriptor.
	  LValue LBLVal = CGF.EmitLValueForField(
	      Result.TDBase,
	      *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound));
	  auto *LBVar =
	      cast<VarDecl>(cast<DeclRefExpr>(D.getLowerBoundVariable())->getDecl());
	  CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(), LBLVal.getQuals(),
	                       /*IsInitializer=*/true);
	  // Store the upper bound into the task descriptor.
	  LValue UBLVal = CGF.EmitLValueForField(
	      Result.TDBase,
	      *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound));
	  auto *UBVar =
	      cast<VarDecl>(cast<DeclRefExpr>(D.getUpperBoundVariable())->getDecl());
	  CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(), UBLVal.getQuals(),
	                       /*IsInitializer=*/true);
	  // Store the stride into the task descriptor.
	  LValue StLVal = CGF.EmitLValueForField(
	      Result.TDBase,
	      *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTStride));
	  auto *StVar =
	      cast<VarDecl>(cast<DeclRefExpr>(D.getStrideVariable())->getDecl());
	  CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(), StLVal.getQuals(),
	                       /*IsInitializer=*/true);
	  // Values of the 'sched' argument.
	  enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 };
	  // The bounds are passed by address, the stride by value. The initializers
	  // below are kept in argument order so that the emitted instructions keep
	  // their order as well.
	  llvm::Value *TaskArgs[] = {
	      UpLoc, ThreadID, Result.NewTask, IfVal, LBLVal.getPointer(),
	      UBLVal.getPointer(), CGF.EmitLoadOfScalar(StLVal, SourceLocation()),
	      llvm::ConstantInt::getSigned(CGF.IntTy, Data.Nogroup ? 1 : 0),
	      llvm::ConstantInt::getSigned(
	          CGF.IntTy, Data.Schedule.getPointer()
	                         ? Data.Schedule.getInt() ? NumTasks : Grainsize
	                         : NoSchedule),
	      Data.Schedule.getPointer()
	          ? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty,
	                                      /*isSigned=*/false)
	          : llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0),
	      Result.TaskDupFn
	          ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Result.TaskDupFn,
	                                                            CGF.VoidPtrTy)
	          : llvm::ConstantPointerNull::get(CGF.VoidPtrTy)};
	  CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_taskloop), TaskArgs);
	}

	/// \brief Emit reduction operation for each element of array (required for
	/// array sections) LHS op = RHS.
	/// \param Type Type of array.
	/// \param LHSVar Variable on the left side of the reduction operation
	/// (references element of array in original variable).
	/// \param RHSVar Variable on the right side of the reduction operation
	/// (references element of array in original variable).
	/// \param RedOpGen Generator of reduction operation with use of LHSVar and
	/// RHSVar.
	/// \param XExpr Forwarded unchanged to \p RedOpGen.
	/// \param EExpr Forwarded unchanged to \p RedOpGen.
	/// \param UpExpr Forwarded unchanged to \p RedOpGen.
	static void EmitOMPAggregateReduction(
	    CodeGenFunction &CGF, QualType Type, const VarDecl *LHSVar,
	    const VarDecl *RHSVar,
	    const llvm::function_ref<void(CodeGenFunction &CGF, const Expr *,
	                                  const Expr *, const Expr *)> &RedOpGen,
	    const Expr *XExpr = nullptr, const Expr *EExpr = nullptr,
	    const Expr *UpExpr = nullptr) {
	  // Perform the reduction element by element.
	  QualType ElementTy;
	  Address LHSAddr = CGF.GetAddrOfLocalVar(LHSVar);
	  Address RHSAddr = CGF.GetAddrOfLocalVar(RHSVar);

	  // Drill down to the base element type on both arrays.
	  auto ArrayTy = Type->getAsArrayTypeUnsafe();
	  auto NumElements = CGF.emitArrayLength(ArrayTy, ElementTy, LHSAddr);

	  auto RHSBegin = RHSAddr.getPointer();
	  auto LHSBegin = LHSAddr.getPointer();
	  // Cast from pointer to array type to pointer to single element.
	  auto LHSEnd = CGF.Builder.CreateGEP(LHSBegin, NumElements);
	  // The basic structure here is a while-do loop.
	  auto BodyBB = CGF.createBasicBlock("omp.arraycpy.body");
	  auto DoneBB = CGF.createBasicBlock("omp.arraycpy.done");
	  auto IsEmpty =
	      CGF.Builder.CreateICmpEQ(LHSBegin, LHSEnd, "omp.arraycpy.isempty");
	  CGF.Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);

	  // Enter the loop body, making that address the current address.
	  auto EntryBB = CGF.Builder.GetInsertBlock();
	  CGF.EmitBlock(BodyBB);

	  CharUnits ElementSize = CGF.getContext().getTypeSizeInChars(ElementTy);

	  // PHI nodes tracking the current element on each side; the second incoming
	  // value is added once the loop latch has been emitted.
	  llvm::PHINode *RHSElementPHI = CGF.Builder.CreatePHI(
	      RHSBegin->getType(), 2, "omp.arraycpy.srcElementPast");
	  RHSElementPHI->addIncoming(RHSBegin, EntryBB);
	  Address RHSElementCurrent =
	      Address(RHSElementPHI,
	              RHSAddr.getAlignment().alignmentOfArrayElement(ElementSize));

	  llvm::PHINode *LHSElementPHI = CGF.Builder.CreatePHI(
	      LHSBegin->getType(), 2, "omp.arraycpy.destElementPast");
	  LHSElementPHI->addIncoming(LHSBegin, EntryBB);
	  Address LHSElementCurrent =
	      Address(LHSElementPHI,
	              LHSAddr.getAlignment().alignmentOfArrayElement(ElementSize));

	  // Emit the reduction with LHSVar/RHSVar remapped to the current elements.
	  CodeGenFunction::OMPPrivateScope Scope(CGF);
	  Scope.addPrivate(LHSVar, [=]() -> Address { return LHSElementCurrent; });
	  Scope.addPrivate(RHSVar, [=]() -> Address { return RHSElementCurrent; });
	  Scope.Privatize();
	  RedOpGen(CGF, XExpr, EExpr, UpExpr);
	  Scope.ForceCleanup();

	  // Shift the address forward by one element.
	  auto LHSElementNext = CGF.Builder.CreateConstGEP1_32(
	      LHSElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element");
	  auto RHSElementNext = CGF.Builder.CreateConstGEP1_32(
	      RHSElementPHI, /*Idx0=*/1, "omp.arraycpy.src.element");
	  // Check whether we've reached the end.
	  auto Done =
	      CGF.Builder.CreateICmpEQ(LHSElementNext, LHSEnd, "omp.arraycpy.done");
	  CGF.Builder.CreateCondBr(Done, DoneBB, BodyBB);
	  LHSElementPHI->addIncoming(LHSElementNext, CGF.Builder.GetInsertBlock());
	  RHSElementPHI->addIncoming(RHSElementNext, CGF.Builder.GetInsertBlock());

	  // Done.
	  CGF.EmitBlock(DoneBB, /*IsFinished=*/true);
	}

	/// Emit reduction combiner. If the combiner is a simple expression emit it as
	/// is, otherwise consider it as combiner of UDR decl and emit it as a call of
	/// UDR combiner function.
	///
	/// \param ReductionOp The combiner expression. It is treated as a UDR call when
	/// it is a CallExpr whose callee is an OpaqueValueExpr referring to an
	/// OMPDeclareReductionDecl.
	static void emitReductionCombiner(CodeGenFunction &CGF,
	                                  const Expr *ReductionOp) {
	  if (auto *CE = dyn_cast<CallExpr>(ReductionOp))
	    if (auto *OVE = dyn_cast<OpaqueValueExpr>(CE->getCallee()))
	      if (auto *DRE =
	              dyn_cast<DeclRefExpr>(OVE->getSourceExpr()->IgnoreImpCasts()))
	        if (auto *DRD = dyn_cast<OMPDeclareReductionDecl>(DRE->getDecl())) {
	          std::pair<llvm::Function *, llvm::Function *> Reduction =
	              CGF.CGM.getOpenMPRuntime().getUserDefinedReduction(DRD);
	          // Bind the opaque callee to the first function of the pair for
	          // the duration of the emission.
	          RValue Func = RValue::get(Reduction.first);
	          CodeGenFunction::OpaqueValueMapping Map(CGF, OVE, Func);
	          CGF.EmitIgnoredExpr(ReductionOp);
	          return;
	        }
	  CGF.EmitIgnoredExpr(ReductionOp);
	}

	/// Build the internal function ".omp.reduction.reduction_func" taking two
	/// void* arguments, and return it.
	///
	/// Both arguments are cast to \p ArgsType and indexed as arrays of pointers.
	/// The variables referenced by \p LHSExprs and \p RHSExprs are remapped to the
	/// addresses found in those arrays, then each entry of \p ReductionOps is
	/// emitted. For a private of variably modified type, the array slot after the
	/// variable's own slot is read as the array size.
	static llvm::Value *emitReductionFunction(CodeGenModule &CGM,
	                                          llvm::Type *ArgsType,
	                                          ArrayRef<const Expr *> Privates,
	                                          ArrayRef<const Expr *> LHSExprs,
	                                          ArrayRef<const Expr *> RHSExprs,
	                                          ArrayRef<const Expr *> ReductionOps) {
	  auto &C = CGM.getContext();

	  // void reduction_func(void *LHSArg, void *RHSArg);
	  FunctionArgList Args;
	  ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
	                           C.VoidPtrTy);
	  ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr,
	                           C.VoidPtrTy);
	  Args.push_back(&LHSArg);
	  Args.push_back(&RHSArg);
	  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
	  auto *Fn = llvm::Function::Create(
	      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
	      ".omp.reduction.reduction_func", &CGM.getModule());
	  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
	  CodeGenFunction CGF(CGM);
	  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

	  // Dst = (void*[n])(LHSArg);
	  // Src = (void*[n])(RHSArg);
	  Address LHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	                  CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&LHSArg)),
	                  ArgsType),
	              CGF.getPointerAlign());
	  Address RHS(CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	                  CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(&RHSArg)),
	                  ArgsType),
	              CGF.getPointerAlign());

	  //  ...
	  //  *(Type<i>*)lhs[i] = RedOp<i>(*(Type<i>*)lhs[i], *(Type<i>*)rhs[i]);
	  //  ...
	  CodeGenFunction::OMPPrivateScope Scope(CGF);
	  auto IPriv = Privates.begin();
	  // Idx is the slot in the pointer arrays; it runs ahead of I whenever a
	  // variably modified private consumed an extra slot for its size.
	  unsigned Idx = 0;
	  for (unsigned I = 0, E = ReductionOps.size(); I < E; ++I, ++IPriv, ++Idx) {
	    auto RHSVar = cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl());
	    Scope.addPrivate(RHSVar, [&]() -> Address {
	      return emitAddrOfVarFromArray(CGF, RHS, Idx, RHSVar);
	    });
	    auto LHSVar = cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl());
	    Scope.addPrivate(LHSVar, [&]() -> Address {
	      return emitAddrOfVarFromArray(CGF, LHS, Idx, LHSVar);
	    });
	    QualType PrivTy = (*IPriv)->getType();
	    if (PrivTy->isVariablyModifiedType()) {
	      // Get array size and emit VLA type.
	      ++Idx;
	      Address Elem =
	          CGF.Builder.CreateConstArrayGEP(LHS, Idx, CGF.getPointerSize());
	      llvm::Value *Ptr = CGF.Builder.CreateLoad(Elem);
	      auto *VLA = CGF.getContext().getAsVariableArrayType(PrivTy);
	      auto *OVE = cast<OpaqueValueExpr>(VLA->getSizeExpr());
	      CodeGenFunction::OpaqueValueMapping OpaqueMap(
	          CGF, OVE, RValue::get(CGF.Builder.CreatePtrToInt(Ptr, CGF.SizeTy)));
	      CGF.EmitVariablyModifiedType(PrivTy);
	    }
	  }
	  Scope.Privatize();
	  IPriv = Privates.begin();
	  auto ILHS = LHSExprs.begin();
	  auto IRHS = RHSExprs.begin();
	  for (auto *E : ReductionOps) {
	    if ((*IPriv)->getType()->isArrayType()) {
	      // Emit reduction for array section.
	      auto LHSVar = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
	      auto RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
	      EmitOMPAggregateReduction(
	          CGF, (*IPriv)->getType(), LHSVar, RHSVar,
	          [=](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) {
	            emitReductionCombiner(CGF, E);
	          });
	    } else
	      // Emit reduction for array subscript or single variable.
	      emitReductionCombiner(CGF, E);
	    ++IPriv;
	    ++ILHS;
	    ++IRHS;
	  }
	  Scope.ForceCleanup();
	  CGF.FinishFunction();
	  return Fn;
	}

	/// Emit the combiner \p ReductionOp for a single reduction item.
	///
	/// When \p PrivateRef has array type, the combiner is emitted once per element
	/// through EmitOMPAggregateReduction() using the variables referenced by
	/// \p LHS and \p RHS; otherwise it is emitted once directly.
	static void emitSingleReductionCombiner(CodeGenFunction &CGF,
	                                        const Expr *ReductionOp,
	                                        const Expr *PrivateRef,
	                                        const DeclRefExpr *LHS,
	                                        const DeclRefExpr *RHS) {
	  if (PrivateRef->getType()->isArrayType()) {
	    // Emit reduction for array section.
	    auto *LHSVar = cast<VarDecl>(LHS->getDecl());
	    auto *RHSVar = cast<VarDecl>(RHS->getDecl());
	    EmitOMPAggregateReduction(
	        CGF, PrivateRef->getType(), LHSVar, RHSVar,
	        [=](CodeGenFunction &CGF, const Expr *, const Expr *, const Expr *) {
	          emitReductionCombiner(CGF, ReductionOp);
	        });
	  } else
	    // Emit reduction for array subscript or single variable.
	    emitReductionCombiner(CGF, ReductionOp);
	}

	/// Emit the code that combines the private reduction copies into the original
	/// variables.
	///
	/// \param Privates References to the private copies; their types decide
	/// whether an item is reduced element by element (array type) and whether an
	/// extra slot for the array size is reserved (variably modified type).
	/// \param LHSExprs,RHSExprs DeclRefExprs naming the helper variables used by
	/// the combiners.
	/// \param ReductionOps One combiner expression per reduction item.
	/// \param WithNowait Selects the *_nowait runtime entry points.
	/// \param SimpleReduction When true only the combiners are emitted, without
	/// any runtime calls.
	void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
	                                    ArrayRef<const Expr *> Privates,
	                                    ArrayRef<const Expr *> LHSExprs,
	                                    ArrayRef<const Expr *> RHSExprs,
	                                    ArrayRef<const Expr *> ReductionOps,
	                                    bool WithNowait, bool SimpleReduction) {
	  if (!CGF.HaveInsertPoint())
	    return;
	  // Next code should be emitted for reduction:
	  //
	  // static kmp_critical_name lock = { 0 };
	  //
	  // void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
	  //  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
	  //  ...
	  //  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
	  //  *(Type<n>-1*)rhs[<n>-1]);
	  // }
	  //
	  // ...
	  // void *RedList[<n>] = {&<RHSExprs>[0], ..., &<RHSExprs>[<n>-1]};
	  // switch (__kmpc_reduce{_nowait}(<loc>, <gtid>, <n>, sizeof(RedList),
	  // RedList, reduce_func, &<lock>)) {
	  // case 1:
	  //  ...
	  //  <LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]);
	  //  ...
	  // __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &<lock>);
	  // break;
	  // case 2:
	  //  ...
	  //  Atomic(<LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]));
	  //  ...
	  // [__kmpc_end_reduce(<loc>, <gtid>, &<lock>);]
	  // break;
	  // default:;
	  // }
	  //
	  // if SimpleReduction is true, only the next code is generated:
	  //  ...
	  //  <LHSExprs>[i] = RedOp<i>(<LHSExprs>[i], <RHSExprs>[i]);
	  //  ...

	  auto &C = CGM.getContext();

	  if (SimpleReduction) {
	    CodeGenFunction::RunCleanupsScope Scope(CGF);
	    auto IPriv = Privates.begin();
	    auto ILHS = LHSExprs.begin();
	    auto IRHS = RHSExprs.begin();
	    for (auto *E : ReductionOps) {
	      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
	                                  cast<DeclRefExpr>(*IRHS));
	      ++IPriv;
	      ++ILHS;
	      ++IRHS;
	    }
	    return;
	  }

	  // 1. Build a list of reduction variables.
	  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
	  auto Size = RHSExprs.size();
	  for (auto *E : Privates) {
	    if (E->getType()->isVariablyModifiedType())
	      // Reserve place for array size.
	      ++Size;
	  }
	  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
	  QualType ReductionArrayTy =
	      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
	                             /*IndexTypeQuals=*/0);
	  Address ReductionList =
	      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
	  auto IPriv = Privates.begin();
	  // Idx is the slot in RedList; it runs ahead of I whenever a variably modified
	  // private consumed an extra slot for its size.
	  unsigned Idx = 0;
	  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
	    Address Elem =
	        CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, CGF.getPointerSize());
	    CGF.Builder.CreateStore(
	        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
	        Elem);
	    if ((*IPriv)->getType()->isVariablyModifiedType()) {
	      // Store array size.
	      ++Idx;
	      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
	                                             CGF.getPointerSize());
	      // Named differently from the outer 'Size' to avoid shadowing it.
	      llvm::Value *VLASizeVal = CGF.Builder.CreateIntCast(
	          CGF.getVLASize(
	                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
	              .first,
	          CGF.SizeTy, /*isSigned=*/false);
	      CGF.Builder.CreateStore(
	          CGF.Builder.CreateIntToPtr(VLASizeVal, CGF.VoidPtrTy), Elem);
	    }
	  }

	  // 2. Emit reduce_func().
	  auto *ReductionFn = emitReductionFunction(
	      CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
	      LHSExprs, RHSExprs, ReductionOps);

	  // 3. Create static kmp_critical_name lock = { 0 };
	  auto *Lock = getCriticalRegionLock(".reduction");

	  // 4. Build res = __kmpc_reduce{_nowait}(<loc>, <gtid>, <n>, sizeof(RedList),
	  // RedList, reduce_func, &<lock>);
	  auto *IdentTLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
	  auto *ThreadId = getThreadID(CGF, Loc);
	  auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
	  auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	      ReductionList.getPointer(), CGF.VoidPtrTy);
	  llvm::Value *Args[] = {
	      IdentTLoc,                             // ident_t *<loc>
	      ThreadId,                              // i32 <gtid>
	      CGF.Builder.getInt32(RHSExprs.size()), // i32 <n>
	      ReductionArrayTySize,                  // size_type sizeof(RedList)
	      RL,                                    // void *RedList
	      ReductionFn, // void (*) (void *, void *) <reduce_func>
	      Lock         // kmp_critical_name *&<lock>
	  };
	  auto Res = CGF.EmitRuntimeCall(
	      createRuntimeFunction(WithNowait ? OMPRTL__kmpc_reduce_nowait
	                                       : OMPRTL__kmpc_reduce),
	      Args);

	  // 5. Build switch(res)
	  auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
	  auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/2);

	  // 6. Build case 1:
	  //  ...
	  //  <LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]);
	  //  ...
	  // __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &<lock>);
	  // break;
	  auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
	  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
	  CGF.EmitBlock(Case1BB);

	  // Add emission of __kmpc_end_reduce{_nowait}(<loc>, <gtid>, &<lock>);
	  llvm::Value *EndArgs[] = {
	      IdentTLoc, // ident_t *<loc>
	      ThreadId,  // i32 <gtid>
	      Lock       // kmp_critical_name *&<lock>
	  };
	  auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps](
	      CodeGenFunction &CGF, PrePostActionTy &Action) {
	    auto IPriv = Privates.begin();
	    auto ILHS = LHSExprs.begin();
	    auto IRHS = RHSExprs.begin();
	    for (auto *E : ReductionOps) {
	      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
	                                  cast<DeclRefExpr>(*IRHS));
	      ++IPriv;
	      ++ILHS;
	      ++IRHS;
	    }
	  };
	  RegionCodeGenTy RCG(CodeGen);
	  CommonActionTy Action(
	      nullptr, llvm::None,
	      createRuntimeFunction(WithNowait ? OMPRTL__kmpc_end_reduce_nowait
	                                       : OMPRTL__kmpc_end_reduce),
	      EndArgs);
	  RCG.setAction(Action);
	  RCG(CGF);

	  CGF.EmitBranch(DefaultBB);

	  // 7. Build case 2:
	  //  ...
	  //  Atomic(<LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]));
	  //  ...
	  // break;
	  auto *Case2BB = CGF.createBasicBlock(".omp.reduction.case2");
	  SwInst->addCase(CGF.Builder.getInt32(2), Case2BB);
	  CGF.EmitBlock(Case2BB);

	  auto &&AtomicCodeGen = [Loc, &Privates, &LHSExprs, &RHSExprs, &ReductionOps](
	      CodeGenFunction &CGF, PrePostActionTy &Action) {
	    auto ILHS = LHSExprs.begin();
	    auto IRHS = RHSExprs.begin();
	    auto IPriv = Privates.begin();
	    for (auto *E : ReductionOps) {
	      const Expr *XExpr = nullptr;
	      const Expr *EExpr = nullptr;
	      const Expr *UpExpr = nullptr;
	      BinaryOperatorKind BO = BO_Comma;
	      // Split 'x = <update>' into its parts. The local is not called 'BO' so
	      // that it does not shadow the operator kind declared above.
	      if (auto *AssignOp = dyn_cast<BinaryOperator>(E)) {
	        if (AssignOp->getOpcode() == BO_Assign) {
	          XExpr = AssignOp->getLHS();
	          UpExpr = AssignOp->getRHS();
	        }
	      }
	      // Try to emit update expression as a simple atomic.
	      auto *RHSExpr = UpExpr;
	      if (RHSExpr) {
	        // Analyze RHS part of the whole expression.
	        if (auto *ACO = dyn_cast<AbstractConditionalOperator>(
	                RHSExpr->IgnoreParenImpCasts())) {
	          // If this is a conditional operator, analyze its condition for
	          // min/max reduction operator.
	          RHSExpr = ACO->getCond();
	        }
	        if (auto *BORHS =
	                dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
	          EExpr = BORHS->getRHS();
	          BO = BORHS->getOpcode();
	        }
	      }
	      if (XExpr) {
	        auto VD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
	        auto &&AtomicRedGen = [BO, VD, IPriv,
	                               Loc](CodeGenFunction &CGF, const Expr *XExpr,
	                                    const Expr *EExpr, const Expr *UpExpr) {
	          LValue X = CGF.EmitLValue(XExpr);
	          RValue E;
	          if (EExpr)
	            E = CGF.EmitAnyExpr(EExpr);
	          CGF.EmitOMPAtomicSimpleUpdateExpr(
	              X, E, BO, /*IsXLHSInRHSPart=*/true,
	              llvm::AtomicOrdering::Monotonic, Loc,
	              [&CGF, UpExpr, VD, IPriv, Loc](RValue XRValue) {
	                // Evaluate the update expression with VD remapped to a
	                // temporary holding the supplied value of x.
	                CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
	                PrivateScope.addPrivate(
	                    VD, [&CGF, VD, XRValue, Loc]() -> Address {
	                      Address LHSTemp = CGF.CreateMemTemp(VD->getType());
	                      CGF.emitOMPSimpleStore(
	                          CGF.MakeAddrLValue(LHSTemp, VD->getType()), XRValue,
	                          VD->getType().getNonReferenceType(), Loc);
	                      return LHSTemp;
	                    });
	                (void)PrivateScope.Privatize();
	                return CGF.EmitAnyExpr(UpExpr);
	              });
	        };
	        if ((*IPriv)->getType()->isArrayType()) {
	          // Emit atomic reduction for array section.
	          auto RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
	          EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), VD, RHSVar,
	                                    AtomicRedGen, XExpr, EExpr, UpExpr);
	        } else
	          // Emit atomic reduction for array subscript or single variable.
	          AtomicRedGen(CGF, XExpr, EExpr, UpExpr);
	      } else {
	        // Emit as a critical region.
	        auto &&CritRedGen = [E, Loc](CodeGenFunction &CGF, const Expr *,
	                                     const Expr *, const Expr *) {
	          auto &RT = CGF.CGM.getOpenMPRuntime();
	          RT.emitCriticalRegion(
	              CGF, ".atomic_reduction",
	              [=](CodeGenFunction &CGF, PrePostActionTy &Action) {
	                Action.Enter(CGF);
	                emitReductionCombiner(CGF, E);
	              },
	              Loc);
	        };
	        if ((*IPriv)->getType()->isArrayType()) {
	          auto LHSVar = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
	          auto RHSVar = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
	          EmitOMPAggregateReduction(CGF, (*IPriv)->getType(), LHSVar, RHSVar,
	                                    CritRedGen);
	        } else
	          CritRedGen(CGF, nullptr, nullptr, nullptr);
	      }
	      ++ILHS;
	      ++IRHS;
	      ++IPriv;
	    }
	  };
	  RegionCodeGenTy AtomicRCG(AtomicCodeGen);
	  if (!WithNowait) {
	    // Add emission of __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
	    llvm::Value *EndArgs[] = {
	        IdentTLoc, // ident_t *<loc>
	        ThreadId,  // i32 <gtid>
	        Lock       // kmp_critical_name *&<lock>
	    };
	    CommonActionTy Action(nullptr, llvm::None,
	                          createRuntimeFunction(OMPRTL__kmpc_end_reduce),
	                          EndArgs);
	    AtomicRCG.setAction(Action);
	    AtomicRCG(CGF);
	  } else
	    AtomicRCG(CGF);

	  CGF.EmitBranch(DefaultBB);
	  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
	}

	/// Emit a call to __kmpc_omp_taskwait() at the current insertion point and,
	/// when the enclosing captured-statement info is a CGOpenMPRegionInfo, let it
	/// emit its untied-task switch. Nothing is emitted without an insertion point.
	void CGOpenMPRuntime::emitTaskwaitCall(CodeGenFunction &CGF,
	                                       SourceLocation Loc) {
	  if (!CGF.HaveInsertPoint())
	    return;
	  // Arguments of
	  //   kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32 global_tid);
	  llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
	  llvm::Value *ThreadID = getThreadID(CGF, Loc);
	  llvm::Value *TaskwaitArgs[] = {UpLoc, ThreadID};
	  auto *TaskwaitFn = createRuntimeFunction(OMPRTL__kmpc_omp_taskwait);
	  // The runtime's return value is dropped until untied tasks are supported.
	  CGF.EmitRuntimeCall(TaskwaitFn, TaskwaitArgs);
	  auto *Region = dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo);
	  if (Region)
	    Region->emitUntiedSwitch(CGF);
	}

	/// Emit the body produced by \p CodeGen inline at the current insertion point.
	///
	/// An InlinedOpenMPRegionRAII object is set up for \p InnerKind and
	/// \p HasCancel for the duration of the emission, and the body is emitted
	/// through CGF.CapturedStmtInfo. Nothing is emitted without an insertion
	/// point.
	void CGOpenMPRuntime::emitInlinedDirective(CodeGenFunction &CGF,
	                                           OpenMPDirectiveKind InnerKind,
	                                           const RegionCodeGenTy &CodeGen,
	                                           bool HasCancel) {
	  if (!CGF.HaveInsertPoint())
	    return;
	  InlinedOpenMPRegionRAII Region(CGF, CodeGen, InnerKind, HasCancel);
	  CGF.CapturedStmtInfo->EmitBody(CGF, /*S=*/nullptr);
	}

	namespace {
	/// Cancellation kinds. A value of this enumeration is passed as the
	/// 'cncl_kind' argument of the __kmpc_cancellationpoint() and __kmpc_cancel()
	/// runtime calls emitted in this file.
	enum RTCancelKind {
	CancelNoreq = 0,
	CancelParallel = 1,
	CancelLoop = 2,
	CancelSections = 3,
	CancelTaskgroup = 4
	};
	} // anonymous namespace

	/// Translate the directive kind named in a cancel / cancellation point
	/// construct into the matching RTCancelKind value. Any kind other than
	/// parallel, for and sections is asserted to be taskgroup.
	static RTCancelKind getCancellationKind(OpenMPDirectiveKind CancelRegion) {
	  switch (CancelRegion) {
	  case OMPD_parallel:
	    return CancelParallel;
	  case OMPD_for:
	    return CancelLoop;
	  case OMPD_sections:
	    return CancelSections;
	  default:
	    assert(CancelRegion == OMPD_taskgroup);
	    return CancelTaskgroup;
	  }
	}

	/// Emit a call to __kmpc_cancellationpoint() for \p CancelRegion followed by a
	/// conditional branch to the cancel destination of the enclosing region.
	///
	/// Nothing is emitted when \p CGF has no insertion point or when
	/// CGF.CapturedStmtInfo is not a CGOpenMPRegionInfo.
	void CGOpenMPRuntime::emitCancellationPointCall(
	CodeGenFunction &CGF, SourceLocation Loc,
	OpenMPDirectiveKind CancelRegion) {
	if (!CGF.HaveInsertPoint())
	return;
	// Build call kmp_int32 __kmpc_cancellationpoint(ident_t *loc, kmp_int32
	// global_tid, kmp_int32 cncl_kind);
	if (auto *OMPRegionInfo =
	dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
	- if (OMPRegionInfo->hasCancel()) {
	+ // For 'cancellation point taskgroup', the task region info may not have a
	+ // cancel. This may instead happen in another adjacent task.
	+ if (CancelRegion == OMPD_taskgroup \|\| OMPRegionInfo->hasCancel()) {
	llvm::Value *Args[] = {
	emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc),
	CGF.Builder.getInt32(getCancellationKind(CancelRegion))};
	// The call result is tested below to decide whether to leave the
	// construct.
	auto *Result = CGF.EmitRuntimeCall(
	createRuntimeFunction(OMPRTL__kmpc_cancellationpoint), Args);
	// if (__kmpc_cancellationpoint()) {
	// exit from construct;
	// }
	auto *ExitBB = CGF.createBasicBlock(".cancel.exit");
	auto *ContBB = CGF.createBasicBlock(".cancel.continue");
	auto *Cmp = CGF.Builder.CreateIsNotNull(Result);
	CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB);
	CGF.EmitBlock(ExitBB);
	// exit from construct;
	// The destination is looked up by the kind of the enclosing region, not
	// by CancelRegion.
	auto CancelDest =
	CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind());
	CGF.EmitBranchThroughCleanup(CancelDest);
	CGF.EmitBlock(ContBB, /IsFinished=/true);
	}
	}
	}

	/// Emit a call to __kmpc_cancel() for \p CancelRegion followed by a
	/// conditional branch to the cancel destination of the enclosing region.
	///
	/// Nothing is emitted when \p CGF has no insertion point or when
	/// CGF.CapturedStmtInfo is not a CGOpenMPRegionInfo.
	///
	/// \param IfCond Optional 'if' clause condition. When present the call is
	/// emitted through emitOMPIfClause() with an empty else branch.
	void CGOpenMPRuntime::emitCancelCall(CodeGenFunction &CGF, SourceLocation Loc,
	                                     const Expr *IfCond,
	                                     OpenMPDirectiveKind CancelRegion) {
	  if (!CGF.HaveInsertPoint())
	    return;
	  // Build call kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid,
	  // kmp_int32 cncl_kind);
	  if (auto *OMPRegionInfo =
	          dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo)) {
	    auto &&ThenGen = [Loc, CancelRegion, OMPRegionInfo](CodeGenFunction &CGF,
	                                                        PrePostActionTy &) {
	      auto &RT = CGF.CGM.getOpenMPRuntime();
	      llvm::Value *Args[] = {
	          RT.emitUpdateLocation(CGF, Loc), RT.getThreadID(CGF, Loc),
	          CGF.Builder.getInt32(getCancellationKind(CancelRegion))};
	      // The call result is tested below to decide whether to leave the
	      // construct.
	      auto *Result = CGF.EmitRuntimeCall(
	          RT.createRuntimeFunction(OMPRTL__kmpc_cancel), Args);
	      // if (__kmpc_cancel()) {
	      //   exit from construct;
	      // }
	      auto *ExitBB = CGF.createBasicBlock(".cancel.exit");
	      auto *ContBB = CGF.createBasicBlock(".cancel.continue");
	      auto *Cmp = CGF.Builder.CreateIsNotNull(Result);
	      CGF.Builder.CreateCondBr(Cmp, ExitBB, ContBB);
	      CGF.EmitBlock(ExitBB);
	      // exit from construct;
	      auto CancelDest =
	          CGF.getOMPCancelDestination(OMPRegionInfo->getDirectiveKind());
	      CGF.EmitBranchThroughCleanup(CancelDest);
	      CGF.EmitBlock(ContBB, /*IsFinished=*/true);
	    };
	    if (IfCond)
	      emitOMPIfClause(CGF, IfCond, ThenGen,
	                      [](CodeGenFunction &, PrePostActionTy &) {});
	    else {
	      RegionCodeGenTy ThenRCG(ThenGen);
	      ThenRCG(CGF);
	    }
	  }
	}

	/// \brief Obtain information that uniquely identifies a target entry. This
	/// consists of the file and device IDs as well as line number associated with
	/// the relevant entry source location.
	///
	/// \param Loc Source location of the entry; asserted to be valid and to refer
	/// to a file.
	/// \param DeviceID [out] Device part of the unique ID of the presumed file.
	/// \param FileID [out] File part of the unique ID of the presumed file.
	/// \param LineNum [out] Presumed line number of \p Loc.
	static void getTargetEntryUniqueInfo(ASTContext &C, SourceLocation Loc,
	                                     unsigned &DeviceID, unsigned &FileID,
	                                     unsigned &LineNum) {

	  auto &SM = C.getSourceManager();

	  // The loc should be always valid and have a file ID (the user cannot use
	  // #pragma directives in macros)

	  assert(Loc.isValid() && "Source location is expected to be always valid.");
	  assert(Loc.isFileID() && "Source location is expected to refer to a file.");

	  PresumedLoc PLoc = SM.getPresumedLoc(Loc);
	  assert(PLoc.isValid() && "Source location is expected to be always valid.");

	  llvm::sys::fs::UniqueID ID;
	  // The file system lookup can fail at run time, so this is not an
	  // unreachable state: llvm_unreachable would be undefined behavior in builds
	  // without assertions and ID would be used without having been filled in.
	  // Report a fatal error in every build mode instead.
	  if (llvm::sys::fs::getUniqueID(PLoc.getFilename(), ID))
	    llvm::report_fatal_error(
	        "Source file with target region no longer exists!");

	  DeviceID = ID.getDevice();
	  FileID = ID.getFile();
	  LineNum = PLoc.getLine();
	}

	void CGOpenMPRuntime::emitTargetOutlinedFunction(
	const OMPExecutableDirective &D, StringRef ParentName,
	llvm::Function &OutlinedFn, llvm::Constant &OutlinedFnID,
	bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
	assert(!ParentName.empty() && "Invalid target region parent name!");

	emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
	IsOffloadEntry, CodeGen);
	}

	void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
	const OMPExecutableDirective &D, StringRef ParentName,
	llvm::Function &OutlinedFn, llvm::Constant &OutlinedFnID,
	bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
	// Create a unique name for the entry function using the source location
	// information of the current target region. The name will be something like:
	//
	// __omp_offloading_DD_FFFF_PP_lBB
	//
	// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
	// mangled name of the function that encloses the target region and BB is the
	// line number of the target region.

	unsigned DeviceID;
	unsigned FileID;
	unsigned Line;
	getTargetEntryUniqueInfo(CGM.getContext(), D.getLocStart(), DeviceID, FileID,
	Line);
	SmallString<64> EntryFnName;
	{
	llvm::raw_svector_ostream OS(EntryFnName);
	OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
	<< llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
	}

	const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());

	CodeGenFunction CGF(CGM, true);
	CGOpenMPTargetRegionInfo CGInfo(CS, CodeGen, EntryFnName);
	CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);

	OutlinedFn = CGF.GenerateOpenMPCapturedStmtFunction(CS);

	// If this target outline function is not an offload entry, we don't need to
	// register it.
	if (!IsOffloadEntry)
	return;

	// The target region ID is used by the runtime library to identify the current
	// target region, so it only has to be unique and not necessarily point to
	// anything. It could be the pointer to the outlined function that implements
	// the target region, but we aren't using that so that the compiler doesn't
	// need to keep that, and could therefore inline the host function if proven
	// worthwhile during optimization. In the other hand, if emitting code for the
	// device, the ID has to be the function address so that it can retrieved from
	// the offloading entry and launched by the runtime library. We also mark the
	// outlined function to have external linkage in case we are emitting code for
	// the device, because these functions will be entry points to the device.

	if (CGM.getLangOpts().OpenMPIsDevice) {
	OutlinedFnID = llvm::ConstantExpr::getBitCast(OutlinedFn, CGM.Int8PtrTy);
	OutlinedFn->setLinkage(llvm::GlobalValue::ExternalLinkage);
	} else
	OutlinedFnID = new llvm::GlobalVariable(
	CGM.getModule(), CGM.Int8Ty, /isConstant=/true,
	llvm::GlobalValue::PrivateLinkage,
	llvm::Constant::getNullValue(CGM.Int8Ty), ".omp_offload.region_id");

	// Register the information for the entry associated with this target region.
	OffloadEntriesInfoManager.registerTargetRegionEntryInfo(
	DeviceID, FileID, ParentName, Line, OutlinedFn, OutlinedFnID,
	/Flags=/0);
	}

	/// Discard all CompoundStmts intervening between two constructs.
	///
	/// While \p Body is a CompoundStmt it is replaced by the result of
	/// body_front() on it.
	/// \returns The first statement reached that is not a CompoundStmt; this may
	/// be null, since a null \p Body is tolerated and returned as is.
	static const Stmt *ignoreCompoundStmts(const Stmt *Body) {
	  while (auto *CS = dyn_cast_or_null<CompoundStmt>(Body))
	    Body = CS->body_front();

	  return Body;
	}

	/// \brief Emit the num_teams clause of an enclosed teams directive at the
	/// target region scope.
	///
	/// \returns The num_teams expression cast to a signed 32-bit integer when the
	/// target region encloses a teams directive carrying that clause; the
	/// constant 0 when it encloses a teams directive without the clause; nullptr
	/// when there is no teams directive associated with the target directive.
	///
	/// \param OMPRuntime Not used by this function.
	static llvm::Value *
	emitNumTeamsClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime,
	                                     CodeGenFunction &CGF,
	                                     const OMPExecutableDirective &D) {

	  assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the "
	                                              "teams directive expected to be "
	                                              "emitted only for the host!");

	  // FIXME: For the moment we do not support combined directives with target and
	  // teams, so we do not expect to get any num_teams clause in the provided
	  // directive. Once we support that, this assertion can be replaced by the
	  // actual emission of the clause expression.
	  assert(D.getSingleClause<OMPNumTeamsClause>() == nullptr &&
	         "Not expecting clause in directive.");

	  // If the current target region has a teams region enclosed, we need to get
	  // the number of teams to pass to the runtime function call. This is done
	  // by generating the expression in an inlined region. This is required because
	  // the expression is captured in the enclosing target environment when the
	  // teams directive is not combined with target.

	  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());

	  // FIXME: Accommodate other combined directives with teams when they become
	  // available.
	  if (auto *TeamsDir = dyn_cast_or_null<OMPTeamsDirective>(
	          ignoreCompoundStmts(CS.getCapturedStmt()))) {
	    if (auto *NTE = TeamsDir->getSingleClause<OMPNumTeamsClause>()) {
	      CGOpenMPInnerExprInfo CGInfo(CGF, CS);
	      CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
	      llvm::Value *NumTeams = CGF.EmitScalarExpr(NTE->getNumTeams());
	      return CGF.Builder.CreateIntCast(NumTeams, CGF.Int32Ty,
	                                       /*IsSigned=*/true);
	    }

	    // If we have an enclosed teams directive but no num_teams clause we use
	    // the default value 0.
	    return CGF.Builder.getInt32(0);
	  }

	  // No teams associated with the directive.
	  return nullptr;
	}

	/// \brief Emit the thread_limit clause of an enclosed teams directive at the
	/// target region scope.
	///
	/// \returns The thread_limit expression cast to a signed 32-bit integer when
	/// the target region encloses a teams directive carrying that clause; the
	/// constant 0 when it encloses a teams directive without the clause; nullptr
	/// when there is no teams directive associated with the target directive.
	///
	/// \param OMPRuntime Not used by this function.
	static llvm::Value *
	emitThreadLimitClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime,
	                                        CodeGenFunction &CGF,
	                                        const OMPExecutableDirective &D) {

	  assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the "
	                                              "teams directive expected to be "
	                                              "emitted only for the host!");

	  // FIXME: For the moment we do not support combined directives with target and
	  // teams, so we do not expect to get any thread_limit clause in the provided
	  // directive. Once we support that, this assertion can be replaced by the
	  // actual emission of the clause expression.
	  assert(D.getSingleClause<OMPThreadLimitClause>() == nullptr &&
	         "Not expecting clause in directive.");

	  // If the current target region has a teams region enclosed, we need to get
	  // the thread limit to pass to the runtime function call. This is done
	  // by generating the expression in an inlined region. This is required because
	  // the expression is captured in the enclosing target environment when the
	  // teams directive is not combined with target.

	  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());

	  // FIXME: Accommodate other combined directives with teams when they become
	  // available.
	  if (auto *TeamsDir = dyn_cast_or_null<OMPTeamsDirective>(
	          ignoreCompoundStmts(CS.getCapturedStmt()))) {
	    if (auto *TLE = TeamsDir->getSingleClause<OMPThreadLimitClause>()) {
	      CGOpenMPInnerExprInfo CGInfo(CGF, CS);
	      CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
	      llvm::Value *ThreadLimit = CGF.EmitScalarExpr(TLE->getThreadLimit());
	      return CGF.Builder.CreateIntCast(ThreadLimit, CGF.Int32Ty,
	                                       /*IsSigned=*/true);
	    }

	    // If we have an enclosed teams directive but no thread_limit clause we use
	    // the default value 0.
	    return CGF.Builder.getInt32(0);
	  }

	  // No teams associated with the directive.
	  return nullptr;
	}

	namespace {
	// \brief Utility to handle information from clauses associated with a given
	// construct that use mappable expressions (e.g. 'map' clause, 'to' clause).
	// It provides a convenient interface to obtain the information and generate
	// code for that information.
	class MappableExprsHandler {
	public:
	/// \brief Bit flags describing how each mapped entry is to be handled. Every
	/// enumerator occupies its own bit so that flags can be OR-ed together.
	enum OpenMPOffloadMappingFlags {
	  /// \brief (0x01) Allocate memory on the device and move data from host to
	  /// device.
	  OMP_MAP_TO = 1 << 0,
	  /// \brief (0x02) Allocate memory on the device and move data from device to
	  /// host.
	  OMP_MAP_FROM = 1 << 1,
	  /// \brief (0x04) Perform the requested mapping action unconditionally, even
	  /// when the element was already mapped before.
	  OMP_MAP_ALWAYS = 1 << 2,
	  /// \brief (0x08) Remove the element from the device environment regardless
	  /// of its current reference count.
	  OMP_MAP_DELETE = 1 << 3,
	  /// \brief (0x10) The mapped element is a pointer, so its pointee has to be
	  /// mapped as well.
	  OMP_MAP_IS_PTR = 1 << 4,
	  /// \brief (0x20) The argument is the first one that relates to a given
	  /// map/private clause expression. A single map/privatization may produce
	  /// several arguments for the runtime library.
	  OMP_MAP_FIRST_REF = 1 << 5,
	  /// \brief (0x40) The runtime library has to return the device pointer in
	  /// the position of the data being mapped.
	  OMP_MAP_RETURN_PTR = 1 << 6,
	  /// \brief (0x80) The reference being passed is a pointer to private data.
	  OMP_MAP_PRIVATE_PTR = 1 << 7,
	  /// \brief (0x100) The element is passed to the device by value.
	  OMP_MAP_PRIVATE_VAL = 1 << 8,
	};

	/// Class that associates information with a base pointer to be passed to the
	/// runtime library.
	class BasePointerInfo {
	/// The base pointer.
	llvm::Value *Ptr = nullptr;
	/// The base declaration that refers to this device pointer, or null if
	/// there is none.
	const ValueDecl *DevPtrDecl = nullptr;

	public:
	BasePointerInfo(llvm::Value Ptr, const ValueDecl DevPtrDecl = nullptr)
	: Ptr(Ptr), DevPtrDecl(DevPtrDecl) {}
	llvm::Value operator() const { return Ptr; }
	const ValueDecl *getDevicePtrDecl() const { return DevPtrDecl; }
	void setDevicePtrDecl(const ValueDecl *D) { DevPtrDecl = D; }
	};

	/// Array of base pointers, each with its optional device-pointer declaration.
	using MapBaseValuesArrayTy = SmallVector<BasePointerInfo, 16>;
	/// Array of IR values; used for the section pointers and for the sizes.
	using MapValuesArrayTy = SmallVector<llvm::Value *, 16>;
	/// Array of map type flag words (OR-ed OpenMPOffloadMappingFlags values).
	using MapFlagsArrayTy = SmallVector<unsigned, 16>;

	private:
	/// \brief Directive from where the map clauses were extracted. The map, to,
	/// from and use_device_ptr clauses are read from it when the map information
	/// is generated.
	const OMPExecutableDirective &CurDir;

	/// \brief Function the directive is being generated for. All addresses and
	/// sizes are emitted through it.
	CodeGenFunction &CGF;

	/// \brief Set of all first private variables in the current directive. It
	/// holds the canonical declarations named in the firstprivate clauses and is
	/// filled in by the constructor.
	llvm::SmallPtrSet<const VarDecl *, 8> FirstPrivateDecls;

	/// Map between device pointer declarations and their expression components.
	/// The key value for declarations in 'this' is null. It is filled in by the
	/// constructor from the is_device_ptr clauses of the directive.
	llvm::DenseMap<
	const ValueDecl *,
	SmallVector<OMPClauseMappableExprCommon::MappableExprComponentListRef, 4>>
	DevPointersMap;

	/// \brief Emit the size, in bytes, of the storage designated by the mappable
	/// expression \a E.
	///
	/// For an array section the size is derived from the section length rather
	/// than from the type of the expression; for anything else it is the size of
	/// the expression type with references stripped.
	llvm::Value *getExprTypeSize(const Expr *E) const {
	  auto ExprTy = E->getType().getCanonicalType();

	  // Reference types are ignored for mapping purposes.
	  if (auto *RefTy = ExprTy->getAs<ReferenceType>())
	    ExprTy = RefTy->getPointeeType().getCanonicalType();

	  // Given that an array section is considered a built-in type, we need to
	  // do the calculation based on the length of the section instead of relying
	  // on CGF.getTypeSize(E->getType()).
	  if (const auto *OAE = dyn_cast<OMPArraySectionExpr>(E)) {
	    QualType BaseTy = OMPArraySectionExpr::getBaseOriginalType(
	                          OAE->getBase()->IgnoreParenImpCasts())
	                          .getCanonicalType();

	    // If there is no length associated with the expression, that means we
	    // are using the whole length of the base.
	    if (!OAE->getLength() && OAE->getColonLoc().isValid())
	      return CGF.getTypeSize(BaseTy);

	    llvm::Value *ElemSize;
	    if (auto *PTy = BaseTy->getAs<PointerType>())
	      ElemSize = CGF.getTypeSize(PTy->getPointeeType().getCanonicalType());
	    else {
	      auto *ATy = cast<ArrayType>(BaseTy.getTypePtr());
	      assert(ATy && "Expecting array type if not a pointer type.");
	      ElemSize = CGF.getTypeSize(ATy->getElementType().getCanonicalType());
	    }

	    // If we don't have a length at this point, that is because we have an
	    // array section with a single element.
	    if (!OAE->getLength())
	      return ElemSize;

	    // Size of the section = length (converted to size_t) * element size.
	    auto *LengthVal = CGF.EmitScalarExpr(OAE->getLength());
	    LengthVal =
	        CGF.Builder.CreateIntCast(LengthVal, CGF.SizeTy, /*isSigned=*/false);
	    return CGF.Builder.CreateNUWMul(LengthVal, ElemSize);
	  }
	  return CGF.getTypeSize(ExprTy);
	}

	/// \brief Return the flag bits that correspond to a given map type and map
	/// type modifier.
	///
	/// \param MapType Map type of the clause; alloc and release contribute no
	/// bits.
	/// \param MapTypeModifier Modifier of the clause; only 'always' adds a bit.
	/// \param AddPtrFlag If true, OMP_MAP_IS_PTR is added to mark the map as a
	/// pointer.
	/// \param AddIsFirstFlag If true, OMP_MAP_FIRST_REF is added to mark the map
	/// as the first one of a series of maps that relate to the same map
	/// expression.
	unsigned getMapTypeBits(OpenMPMapClauseKind MapType,
	                        OpenMPMapClauseKind MapTypeModifier, bool AddPtrFlag,
	                        bool AddIsFirstFlag) const {
	  unsigned Bits = 0u;
	  switch (MapType) {
	  case OMPC_MAP_alloc:
	  case OMPC_MAP_release:
	    // alloc and release is the default behavior in the runtime library, i.e.
	    // if we don't pass any bits alloc/release that is what the runtime is
	    // going to do. Therefore, we don't need to signal anything for these two
	    // type modifiers.
	    break;
	  case OMPC_MAP_to:
	    Bits = OMP_MAP_TO;
	    break;
	  case OMPC_MAP_from:
	    Bits = OMP_MAP_FROM;
	    break;
	  case OMPC_MAP_tofrom:
	    Bits = OMP_MAP_TO | OMP_MAP_FROM;
	    break;
	  case OMPC_MAP_delete:
	    Bits = OMP_MAP_DELETE;
	    break;
	  default:
	    llvm_unreachable("Unexpected map type!");
	    break;
	  }
	  if (AddPtrFlag)
	    Bits |= OMP_MAP_IS_PTR;
	  if (AddIsFirstFlag)
	    Bits |= OMP_MAP_FIRST_REF;
	  if (MapTypeModifier == OMPC_MAP_always)
	    Bits |= OMP_MAP_ALWAYS;
	  return Bits;
	}

	/// \brief Return true if the provided expression is a final array section. A
	/// final array section, is one whose length can't be proved to be one.
	bool isFinalArraySectionExpression(const Expr *E) const {
	auto *OASE = dyn_cast<OMPArraySectionExpr>(E);

	// It is not an array section and therefore not a unity-size one.
	if (!OASE)
	return false;

	// An array section with no colon always refer to a single element.
	if (OASE->getColonLoc().isInvalid())
	return false;

	auto *Length = OASE->getLength();

	// If we don't have a length we have to check if the array has size 1
	// for this dimension. Also, we should always expect a length if the
	// base type is pointer.
	if (!Length) {
	auto BaseQTy = OMPArraySectionExpr::getBaseOriginalType(
	OASE->getBase()->IgnoreParenImpCasts())
	.getCanonicalType();
	if (auto *ATy = dyn_cast<ConstantArrayType>(BaseQTy.getTypePtr()))
	return ATy->getSize().getSExtValue() != 1;
	// If we don't have a constant dimension length, we have to consider
	// the current section as having any size, so it is not necessarily
	// unitary. If it happen to be unity size, that's user fault.
	return true;
	}

	// Check if the length evaluates to 1.
	llvm::APSInt ConstLength;
	if (!Length->EvaluateAsInt(ConstLength, CGF.getContext()))
	return true; // Can have more that size 1.

	return ConstLength.getSExtValue() != 1;
	}

	/// \brief Generate the base pointers, section pointers, sizes and map type
	/// bits for the provided map type, map modifier, and expression components.
	/// The generated entries are appended to \a BasePointers, \a Pointers,
	/// \a Sizes and \a Types, which are kept in lockstep (one element each per
	/// entry). \a IsFirstComponentList should be set to true if the provided set
	/// of components is the first associated with a capture.
	void generateInfoForComponentList(
	    OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapTypeModifier,
	    OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
	    MapBaseValuesArrayTy &BasePointers, MapValuesArrayTy &Pointers,
	    MapValuesArrayTy &Sizes, MapFlagsArrayTy &Types,
	    bool IsFirstComponentList) const {

	  // The following summarizes what has to be generated for each map and the
	  // types below. The generated information is expressed in this order:
	  // base pointer, section pointer, size, flags
	  // (to add to the ones that come from the map type and modifier).
	  //
	  // double d;
	  // int i[100];
	  // float *p;
	  //
	  // struct S1 {
	  //   int i;
	  //   float f[50];
	  // }
	  // struct S2 {
	  //   int i;
	  //   float f[50];
	  //   S1 s;
	  //   double *p;
	  //   struct S2 *ps;
	  // }
	  // S2 s;
	  // S2 *ps;
	  //
	  // map(d)
	  // &d, &d, sizeof(double), noflags
	  //
	  // map(i)
	  // &i, &i, 100*sizeof(int), noflags
	  //
	  // map(i[1:23])
	  // &i(=&i[0]), &i[1], 23*sizeof(int), noflags
	  //
	  // map(p)
	  // &p, &p, sizeof(float*), noflags
	  //
	  // map(p[1:24])
	  // p, &p[1], 24*sizeof(float), noflags
	  //
	  // map(s)
	  // &s, &s, sizeof(S2), noflags
	  //
	  // map(s.i)
	  // &s, &(s.i), sizeof(int), noflags
	  //
	  // map(s.s.f)
	  // &s, &(s.s.f), 50*sizeof(float), noflags
	  //
	  // map(s.p)
	  // &s, &(s.p), sizeof(double*), noflags
	  //
	  // map(s.p[:22])
	  // &s, &(s.p), sizeof(double*), noflags
	  // &(s.p), &(s.p[0]), 22*sizeof(double), ptr_flag + extra_flag
	  //
	  // map(s.ps)
	  // &s, &(s.ps), sizeof(S2*), noflags
	  //
	  // map(s.ps->s.i)
	  // &s, &(s.ps), sizeof(S2*), noflags
	  // &(s.ps), &(s.ps->s.i), sizeof(int), ptr_flag + extra_flag
	  //
	  // map(s.ps->ps)
	  // &s, &(s.ps), sizeof(S2*), noflags
	  // &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
	  //
	  // map(s.ps->ps->ps)
	  // &s, &(s.ps), sizeof(S2*), noflags
	  // &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
	  // &(s.ps->ps), &(s.ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
	  //
	  // map(s.ps->ps->s.f[:22])
	  // &s, &(s.ps), sizeof(S2*), noflags
	  // &(s.ps), &(s.ps->ps), sizeof(S2*), ptr_flag + extra_flag
	  // &(s.ps->ps), &(s.ps->ps->s.f[0]), 22*sizeof(float), ptr_flag + extra_flag
	  //
	  // map(ps)
	  // &ps, &ps, sizeof(S2*), noflags
	  //
	  // map(ps->i)
	  // ps, &(ps->i), sizeof(int), noflags
	  //
	  // map(ps->s.f)
	  // ps, &(ps->s.f[0]), 50*sizeof(float), noflags
	  //
	  // map(ps->p)
	  // ps, &(ps->p), sizeof(double*), noflags
	  //
	  // map(ps->p[:22])
	  // ps, &(ps->p), sizeof(double*), noflags
	  // &(ps->p), &(ps->p[0]), 22*sizeof(double), ptr_flag + extra_flag
	  //
	  // map(ps->ps)
	  // ps, &(ps->ps), sizeof(S2*), noflags
	  //
	  // map(ps->ps->s.i)
	  // ps, &(ps->ps), sizeof(S2*), noflags
	  // &(ps->ps), &(ps->ps->s.i), sizeof(int), ptr_flag + extra_flag
	  //
	  // map(ps->ps->ps)
	  // ps, &(ps->ps), sizeof(S2*), noflags
	  // &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
	  //
	  // map(ps->ps->ps->ps)
	  // ps, &(ps->ps), sizeof(S2*), noflags
	  // &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
	  // &(ps->ps->ps), &(ps->ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
	  //
	  // map(ps->ps->ps->s.f[:22])
	  // ps, &(ps->ps), sizeof(S2*), noflags
	  // &(ps->ps), &(ps->ps->ps), sizeof(S2*), ptr_flag + extra_flag
	  // &(ps->ps->ps), &(ps->ps->ps->s.f[0]), 22*sizeof(float), ptr_flag +
	  // extra_flag

	  // Track if the map information being generated is the first for a capture.
	  bool IsCaptureFirstInfo = IsFirstComponentList;

	  // Scan the components from the base to the complete expression.
	  auto CI = Components.rbegin();
	  auto CE = Components.rend();
	  auto I = CI;

	  // Track if the map information being generated is the first for a list of
	  // components.
	  bool IsExpressionFirstInfo = true;
	  llvm::Value *BP = nullptr;

	  if (auto *ME = dyn_cast<MemberExpr>(I->getAssociatedExpression())) {
	    // The base is the 'this' pointer. The content of the pointer is going
	    // to be the base of the field being mapped.
	    BP = CGF.EmitScalarExpr(ME->getBase());
	  } else {
	    // The base is the reference to the variable.
	    // BP = &Var.
	    BP = CGF.EmitLValue(cast<DeclRefExpr>(I->getAssociatedExpression()))
	             .getPointer();

	    // If the variable is a pointer and is being dereferenced (i.e. is not
	    // the last component), the base has to be the pointer itself, not its
	    // reference. References are ignored for mapping purposes.
	    QualType Ty =
	        I->getAssociatedDeclaration()->getType().getNonReferenceType();
	    if (Ty->isAnyPointerType() && std::next(I) != CE) {
	      auto PtrAddr = CGF.MakeNaturalAlignAddrLValue(BP, Ty);
	      BP = CGF.EmitLoadOfPointerLValue(PtrAddr.getAddress(),
	                                       Ty->castAs<PointerType>())
	               .getPointer();

	      // We do not need to generate individual map information for the
	      // pointer, it can be associated with the combined storage.
	      ++I;
	    }
	  }

	  for (; I != CE; ++I) {
	    auto Next = std::next(I);

	    // We need to generate the addresses and sizes if this is the last
	    // component, if the component is a pointer or if it is an array section
	    // whose length can't be proved to be one. If this is a pointer, it
	    // becomes the base address for the following components.

	    // A final array section, is one whose length can't be proved to be one.
	    bool IsFinalArraySection =
	        isFinalArraySectionExpression(I->getAssociatedExpression());

	    // Get information on whether the element is a pointer. Have to do a
	    // special treatment for array sections given that they are built-in
	    // types.
	    const auto *OASE =
	        dyn_cast<OMPArraySectionExpr>(I->getAssociatedExpression());
	    bool IsPointer =
	        (OASE &&
	         OMPArraySectionExpr::getBaseOriginalType(OASE)
	             .getCanonicalType()
	             ->isAnyPointerType()) ||
	        I->getAssociatedExpression()->getType()->isAnyPointerType();

	    if (Next == CE || IsPointer || IsFinalArraySection) {

	      // If this is not the last component, we expect the pointer to be
	      // associated with an array expression or member expression.
	      assert((Next == CE ||
	              isa<MemberExpr>(Next->getAssociatedExpression()) ||
	              isa<ArraySubscriptExpr>(Next->getAssociatedExpression()) ||
	              isa<OMPArraySectionExpr>(Next->getAssociatedExpression())) &&
	             "Unexpected expression");

	      auto *LB = CGF.EmitLValue(I->getAssociatedExpression()).getPointer();
	      auto *Size = getExprTypeSize(I->getAssociatedExpression());

	      // If we have a member expression and the current component is a
	      // reference, we have to map the reference too. Whenever we have a
	      // reference, the section that reference refers to is going to be a
	      // load instruction from the storage assigned to the reference.
	      if (isa<MemberExpr>(I->getAssociatedExpression()) &&
	          I->getAssociatedDeclaration()->getType()->isReferenceType()) {
	        auto *LI = cast<llvm::LoadInst>(LB);
	        auto *RefAddr = LI->getPointerOperand();

	        BasePointers.push_back(BP);
	        Pointers.push_back(RefAddr);
	        Sizes.push_back(CGF.getTypeSize(CGF.getContext().VoidPtrTy));
	        Types.push_back(getMapTypeBits(
	            /*MapType*/ OMPC_MAP_alloc, /*MapTypeModifier=*/OMPC_MAP_unknown,
	            !IsExpressionFirstInfo, IsCaptureFirstInfo));
	        IsExpressionFirstInfo = false;
	        IsCaptureFirstInfo = false;
	        // The reference will be the next base address.
	        BP = RefAddr;
	      }

	      BasePointers.push_back(BP);
	      Pointers.push_back(LB);
	      Sizes.push_back(Size);

	      // We need to add a pointer flag for each map that comes from the
	      // same expression except for the first one. We also need to signal
	      // this map is the first one that relates with the current capture
	      // (there is a set of entries for each capture).
	      Types.push_back(getMapTypeBits(MapType, MapTypeModifier,
	                                     !IsExpressionFirstInfo,
	                                     IsCaptureFirstInfo));

	      // If we have a final array section, we are done with this expression.
	      if (IsFinalArraySection)
	        break;

	      // The pointer becomes the base for the next element.
	      if (Next != CE)
	        BP = LB;

	      IsExpressionFirstInfo = false;
	      IsCaptureFirstInfo = false;
	      continue;
	    }
	  }
	}

	/// \brief Return the adjusted map modifiers if the declaration a capture
	/// refers to appears in a first-private clause. This is expected to be used
	/// only with directives that start with 'target'.
	///
	/// \param Cap Capture to inspect; it must capture a variable by reference.
	/// \param CurrentModifiers Flags computed so far for the capture.
	/// \returns OMP_MAP_PRIVATE_PTR | OMP_MAP_TO if the captured variable is
	/// known as first-private in this handler, \a CurrentModifiers otherwise.
	unsigned adjustMapModifiersForPrivateClauses(const CapturedStmt::Capture &Cap,
	                                             unsigned CurrentModifiers) {
	  assert(Cap.capturesVariable() && "Expected capture by reference only!");

	  // A first private variable captured by reference will use only the
	  // 'private ptr' and 'map to' flag. Return the right flags if the captured
	  // declaration is known as first-private in this handler.
	  if (FirstPrivateDecls.count(Cap.getCapturedVar()))
	    return MappableExprsHandler::OMP_MAP_PRIVATE_PTR |
	           MappableExprsHandler::OMP_MAP_TO;

	  // We didn't modify anything.
	  return CurrentModifiers;
	}

	public:
	/// \brief Build a handler for directive \a Dir, emitting code through
	/// \a CGF. The firstprivate and is_device_ptr clauses of the directive are
	/// scanned up front and cached.
	MappableExprsHandler(const OMPExecutableDirective &Dir, CodeGenFunction &CGF)
	    : CurDir(Dir), CGF(CGF) {
	  // Remember the canonical declaration of every firstprivate variable.
	  for (const auto *FPClause : Dir.getClausesOfKind<OMPFirstprivateClause>()) {
	    for (const auto *VarRef : FPClause->varlists()) {
	      const auto *Decl = cast<DeclRefExpr>(VarRef)->getDecl();
	      const auto *Var = cast<VarDecl>(Decl);
	      FirstPrivateDecls.insert(Var->getCanonicalDecl());
	    }
	  }

	  // Group the is_device_ptr component lists by the declaration they refer
	  // to.
	  for (const auto *DPClause : Dir.getClausesOfKind<OMPIsDevicePtrClause>()) {
	    for (auto DeclAndComponents : DPClause->component_lists()) {
	      auto &ListsForDecl = DevPointersMap[DeclAndComponents.first];
	      ListsForDecl.push_back(DeclAndComponents.second);
	    }
	  }
	}

	/// \brief Generate all the base pointers, section pointers, sizes and map
	/// types for the mappable expressions found in the map, to, from and
	/// use_device_ptr clauses of the current directive.
	///
	/// The four output arrays are cleared first and then filled in lockstep.
	/// Entries that relate with a use_device_ptr item get the 'return pointer'
	/// flag, and the relevant declaration is recorded in the corresponding
	/// element of \a BasePointers.
	void generateAllInfo(MapBaseValuesArrayTy &BasePointers,
	                     MapValuesArrayTy &Pointers, MapValuesArrayTy &Sizes,
	                     MapFlagsArrayTy &Types) const {
	  BasePointers.clear();
	  Pointers.clear();
	  Sizes.clear();
	  Types.clear();

	  struct MapInfo {
	    /// Kind that defines how a device pointer has to be returned.
	    enum ReturnPointerKind {
	      // Don't have to return any pointer.
	      RPK_None,
	      // Pointer is the base of the declaration.
	      RPK_Base,
	      // Pointer is a member of the base declaration - 'this'
	      RPK_Member,
	      // Pointer is a reference and a member of the base declaration - 'this'
	      RPK_MemberReference,
	    };
	    OMPClauseMappableExprCommon::MappableExprComponentListRef Components;
	    OpenMPMapClauseKind MapType;
	    OpenMPMapClauseKind MapTypeModifier;
	    ReturnPointerKind ReturnDevicePointer;

	    MapInfo()
	        : MapType(OMPC_MAP_unknown), MapTypeModifier(OMPC_MAP_unknown),
	          ReturnDevicePointer(RPK_None) {}
	    MapInfo(
	        OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
	        OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapTypeModifier,
	        ReturnPointerKind ReturnDevicePointer)
	        : Components(Components), MapType(MapType),
	          MapTypeModifier(MapTypeModifier),
	          ReturnDevicePointer(ReturnDevicePointer) {}
	  };

	  // We have to process the component lists that relate with the same
	  // declaration in a single chunk so that we can generate the map flags
	  // correctly. Therefore, we organize all lists in a map.
	  llvm::DenseMap<const ValueDecl *, SmallVector<MapInfo, 8>> Info;

	  // Helper function to fill the information map for the different supported
	  // clauses.
	  auto &&InfoGen = [&Info](
	      const ValueDecl *D,
	      OMPClauseMappableExprCommon::MappableExprComponentListRef L,
	      OpenMPMapClauseKind MapType, OpenMPMapClauseKind MapModifier,
	      MapInfo::ReturnPointerKind ReturnDevicePointer) {
	    const ValueDecl *VD =
	        D ? cast<ValueDecl>(D->getCanonicalDecl()) : nullptr;
	    Info[VD].push_back({L, MapType, MapModifier, ReturnDevicePointer});
	  };

	  // FIXME: MSVC 2013 seems to require this-> to find member CurDir.
	  for (auto *C : this->CurDir.getClausesOfKind<OMPMapClause>())
	    for (auto L : C->component_lists())
	      InfoGen(L.first, L.second, C->getMapType(), C->getMapTypeModifier(),
	              MapInfo::RPK_None);
	  for (auto *C : this->CurDir.getClausesOfKind<OMPToClause>())
	    for (auto L : C->component_lists())
	      InfoGen(L.first, L.second, OMPC_MAP_to, OMPC_MAP_unknown,
	              MapInfo::RPK_None);
	  for (auto *C : this->CurDir.getClausesOfKind<OMPFromClause>())
	    for (auto L : C->component_lists())
	      InfoGen(L.first, L.second, OMPC_MAP_from, OMPC_MAP_unknown,
	              MapInfo::RPK_None);

	  // Look at the use_device_ptr clause information and mark the existing map
	  // entries as such. If there is no map information for an entry in the
	  // use_device_ptr list, we create one with map type 'alloc' and zero size
	  // section. It is the user fault if that was not mapped before.
	  // FIXME: MSVC 2013 seems to require this-> to find member CurDir.
	  for (auto *C : this->CurDir.getClausesOfKind<OMPUseDevicePtrClause>())
	    for (auto L : C->component_lists()) {
	      assert(!L.second.empty() && "Not expecting empty list of components!");
	      const ValueDecl *VD = L.second.back().getAssociatedDeclaration();
	      VD = cast<ValueDecl>(VD->getCanonicalDecl());
	      auto *IE = L.second.back().getAssociatedExpression();
	      // If the first component is a member expression, we have to look into
	      // 'this', which maps to null in the map of map information. Otherwise
	      // look directly for the information.
	      auto It = Info.find(isa<MemberExpr>(IE) ? nullptr : VD);

	      // We potentially have map information for this declaration already.
	      // Look for the first set of components that refer to it.
	      if (It != Info.end()) {
	        auto CI = std::find_if(
	            It->second.begin(), It->second.end(), [VD](const MapInfo &MI) {
	              return MI.Components.back().getAssociatedDeclaration() == VD;
	            });
	        // If we found a map entry, signal that the pointer has to be returned
	        // and move on to the next declaration.
	        if (CI != It->second.end()) {
	          CI->ReturnDevicePointer = isa<MemberExpr>(IE)
	                                        ? (VD->getType()->isReferenceType()
	                                               ? MapInfo::RPK_MemberReference
	                                               : MapInfo::RPK_Member)
	                                        : MapInfo::RPK_Base;
	          continue;
	        }
	      }

	      // We didn't find any match in our map information - generate a zero
	      // size array section.
	      // FIXME: MSVC 2013 seems to require this-> to find member CGF.
	      llvm::Value *Ptr =
	          this->CGF
	              .EmitLoadOfLValue(this->CGF.EmitLValue(IE), SourceLocation())
	              .getScalarVal();
	      BasePointers.push_back({Ptr, VD});
	      Pointers.push_back(Ptr);
	      Sizes.push_back(llvm::Constant::getNullValue(this->CGF.SizeTy));
	      Types.push_back(OMP_MAP_RETURN_PTR | OMP_MAP_FIRST_REF);
	    }

	  for (auto &M : Info) {
	    // We need to know when we generate information for the first component
	    // associated with a capture, because the mapping flags depend on it.
	    bool IsFirstComponentList = true;
	    for (MapInfo &L : M.second) {
	      assert(!L.Components.empty() &&
	             "Not expecting declaration with no component lists.");

	      // Remember the current base pointer index.
	      unsigned CurrentBasePointersIdx = BasePointers.size();
	      // FIXME: MSVC 2013 seems to require this-> to find the member method.
	      this->generateInfoForComponentList(L.MapType, L.MapTypeModifier,
	                                         L.Components, BasePointers, Pointers,
	                                         Sizes, Types, IsFirstComponentList);

	      // If this entry relates with a device pointer, set the relevant
	      // declaration and add the 'return pointer' flag.
	      if (IsFirstComponentList &&
	          L.ReturnDevicePointer != MapInfo::RPK_None) {
	        // If the pointer is not the base of the map, we need to skip the
	        // base. If it is a reference in a member field, we also need to skip
	        // the map of the reference.
	        if (L.ReturnDevicePointer != MapInfo::RPK_Base) {
	          ++CurrentBasePointersIdx;
	          if (L.ReturnDevicePointer == MapInfo::RPK_MemberReference)
	            ++CurrentBasePointersIdx;
	        }
	        assert(BasePointers.size() > CurrentBasePointersIdx &&
	               "Unexpected number of mapped base pointers.");

	        auto *RelevantVD = L.Components.back().getAssociatedDeclaration();
	        assert(RelevantVD &&
	               "No relevant declaration related with device pointer??");

	        BasePointers[CurrentBasePointersIdx].setDevicePtrDecl(RelevantVD);
	        Types[CurrentBasePointersIdx] |= OMP_MAP_RETURN_PTR;
	      }
	      IsFirstComponentList = false;
	    }
	  }
	}

	/// \brief Generate the base pointers, section pointers, sizes and map types
	/// associated to a given capture.
	///
	/// \param Cap Capture to generate information for; it must not capture a
	/// variable array type.
	/// \param Arg Value passed for the capture; it is only used when the captured
	/// declaration appears in an is_device_ptr clause, in which case it is passed
	/// by value.
	/// \param BasePointers,Pointers,Sizes,Types Output arrays; they are cleared
	/// first and are left empty if no clause provides information for the
	/// capture.
	void generateInfoForCapture(const CapturedStmt::Capture *Cap,
	                            llvm::Value *Arg,
	                            MapBaseValuesArrayTy &BasePointers,
	                            MapValuesArrayTy &Pointers,
	                            MapValuesArrayTy &Sizes,
	                            MapFlagsArrayTy &Types) const {
	  assert(!Cap->capturesVariableArrayType() &&
	         "Not expecting to generate map info for a variable array type!");

	  BasePointers.clear();
	  Pointers.clear();
	  Sizes.clear();
	  Types.clear();

	  // We need to know when we generating information for the first component
	  // associated with a capture, because the mapping flags depend on it.
	  bool IsFirstComponentList = true;

	  // A capture of 'this' is represented by a null declaration.
	  const ValueDecl *VD =
	      Cap->capturesThis()
	          ? nullptr
	          : cast<ValueDecl>(Cap->getCapturedVar()->getCanonicalDecl());

	  // If this declaration appears in a is_device_ptr clause we just have to
	  // pass the pointer by value. If it is a reference to a declaration, we just
	  // pass its value, otherwise, if it is a member expression, we need to map
	  // 'to' the field.
	  if (!VD) {
	    auto It = DevPointersMap.find(VD);
	    if (It != DevPointersMap.end()) {
	      for (auto L : It->second) {
	        generateInfoForComponentList(
	            /*MapType=*/OMPC_MAP_to, /*MapTypeModifier=*/OMPC_MAP_unknown, L,
	            BasePointers, Pointers, Sizes, Types, IsFirstComponentList);
	        IsFirstComponentList = false;
	      }
	      return;
	    }
	  } else if (DevPointersMap.count(VD)) {
	    BasePointers.push_back({Arg, VD});
	    Pointers.push_back(Arg);
	    Sizes.push_back(CGF.getTypeSize(CGF.getContext().VoidPtrTy));
	    Types.push_back(OMP_MAP_PRIVATE_VAL | OMP_MAP_FIRST_REF);
	    return;
	  }

	  // FIXME: MSVC 2013 seems to require this-> to find member CurDir.
	  for (auto *C : this->CurDir.getClausesOfKind<OMPMapClause>())
	    for (auto L : C->decl_component_lists(VD)) {
	      assert(L.first == VD &&
	             "We got information for the wrong declaration??");
	      assert(!L.second.empty() &&
	             "Not expecting declaration with no component lists.");
	      generateInfoForComponentList(C->getMapType(), C->getMapTypeModifier(),
	                                   L.second, BasePointers, Pointers, Sizes,
	                                   Types, IsFirstComponentList);
	      IsFirstComponentList = false;
	    }

	  return;
	}

	/// \brief Generate the default map information for a given capture \a CI,
	/// record field declaration \a RI and captured value \a CV.
	///
	/// Exactly one entry is appended to each of \a CurBasePointers,
	/// \a CurPointers, \a CurSizes and \a CurMapTypes, and that entry always
	/// carries the OMP_MAP_FIRST_REF flag. How the entry is built depends on
	/// whether the capture is of 'this', by copy, or by reference.
	void generateDefaultMapInfo(const CapturedStmt::Capture &CI,
	                            const FieldDecl &RI, llvm::Value *CV,
	                            MapBaseValuesArrayTy &CurBasePointers,
	                            MapValuesArrayTy &CurPointers,
	                            MapValuesArrayTy &CurSizes,
	                            MapFlagsArrayTy &CurMapTypes) {

	  // Do the default mapping.
	  if (CI.capturesThis()) {
	    CurBasePointers.push_back(CV);
	    CurPointers.push_back(CV);
	    // The size mapped is that of the object 'this' points to.
	    const PointerType *PtrTy = cast<PointerType>(RI.getType().getTypePtr());
	    CurSizes.push_back(CGF.getTypeSize(PtrTy->getPointeeType()));
	    // Default map type.
	    CurMapTypes.push_back(OMP_MAP_TO | OMP_MAP_FROM);
	  } else if (CI.capturesVariableByCopy()) {
	    CurBasePointers.push_back(CV);
	    CurPointers.push_back(CV);
	    if (!RI.getType()->isAnyPointerType()) {
	      // We have to signal to the runtime captures passed by value that are
	      // not pointers.
	      CurMapTypes.push_back(OMP_MAP_PRIVATE_VAL);
	      CurSizes.push_back(CGF.getTypeSize(RI.getType()));
	    } else {
	      // Pointers are implicitly mapped with a zero size and no flags
	      // (other than first map that is added for all implicit maps).
	      CurMapTypes.push_back(0u);
	      CurSizes.push_back(llvm::Constant::getNullValue(CGF.SizeTy));
	    }
	  } else {
	    assert(CI.capturesVariable() && "Expected captured reference.");
	    CurBasePointers.push_back(CV);
	    CurPointers.push_back(CV);

	    const ReferenceType *PtrTy =
	        cast<ReferenceType>(RI.getType().getTypePtr());
	    QualType ElementType = PtrTy->getPointeeType();
	    CurSizes.push_back(CGF.getTypeSize(ElementType));
	    // The default map type for a scalar/complex type is 'to' because by
	    // default the value doesn't have to be retrieved. For an aggregate
	    // type, the default is 'tofrom'.
	    CurMapTypes.push_back(ElementType->isAggregateType()
	                              ? (OMP_MAP_TO | OMP_MAP_FROM)
	                              : OMP_MAP_TO);

	    // If we have a capture by reference we may need to add the private
	    // pointer flag if the base declaration shows in some first-private
	    // clause.
	    CurMapTypes.back() =
	        adjustMapModifiersForPrivateClauses(CI, CurMapTypes.back());
	  }
	  // Every default map produces a single argument, so, it is always the
	  // first one.
	  CurMapTypes.back() |= OMP_MAP_FIRST_REF;
	}
	};

	/// \brief Device IDs with a reserved meaning.
	enum OpenMPOffloadingReservedDeviceIDs {
	  /// \brief No device was specified; the runtime is expected to obtain it
	  /// from the environment variables defined in the spec.
	  OMP_DEVICEID_UNDEF = -1
	};
	} // anonymous namespace

	/// \brief Emit the arrays used to pass the captures and map information to the
	/// offloading runtime library. If there is no map or capture information,
	/// return nullptr by reference.
	///
	/// \param CGF Function in which the arrays are created and filled.
	/// \param BasePointers,Pointers,Sizes,MapTypes Parallel arrays describing the
	/// mapped entries; entry i of each array belongs to the same map.
	/// \param Info Receives the number of entries and the emitted arrays. Its
	/// array information is reset first; nothing else is emitted when
	/// \a BasePointers is empty.
	static void
	emitOffloadingArrays(CodeGenFunction &CGF,
	                     MappableExprsHandler::MapBaseValuesArrayTy &BasePointers,
	                     MappableExprsHandler::MapValuesArrayTy &Pointers,
	                     MappableExprsHandler::MapValuesArrayTy &Sizes,
	                     MappableExprsHandler::MapFlagsArrayTy &MapTypes,
	                     CGOpenMPRuntime::TargetDataInfo &Info) {
	  auto &CGM = CGF.CGM;
	  auto &Ctx = CGF.getContext();

	  // Reset the array information.
	  Info.clearArrayInfo();
	  Info.NumberOfPtrs = BasePointers.size();

	  if (Info.NumberOfPtrs) {
	    // Detect if we have any capture size requiring runtime evaluation of the
	    // size so that a constant array could be eventually used.
	    bool hasRuntimeEvaluationCaptureSize = false;
	    for (auto *S : Sizes)
	      if (!isa<llvm::Constant>(S)) {
	        hasRuntimeEvaluationCaptureSize = true;
	        break;
	      }

	    llvm::APInt PointerNumAP(32, Info.NumberOfPtrs, /*isSigned=*/true);
	    QualType PointerArrayType =
	        Ctx.getConstantArrayType(Ctx.VoidPtrTy, PointerNumAP, ArrayType::Normal,
	                                 /*IndexTypeQuals=*/0);

	    Info.BasePointersArray =
	        CGF.CreateMemTemp(PointerArrayType, ".offload_baseptrs").getPointer();
	    Info.PointersArray =
	        CGF.CreateMemTemp(PointerArrayType, ".offload_ptrs").getPointer();

	    // If we don't have any VLA types or other types that require runtime
	    // evaluation, we can use a constant array for the map sizes, otherwise we
	    // need to fill up the arrays as we do for the pointers.
	    if (hasRuntimeEvaluationCaptureSize) {
	      QualType SizeArrayType = Ctx.getConstantArrayType(
	          Ctx.getSizeType(), PointerNumAP, ArrayType::Normal,
	          /*IndexTypeQuals=*/0);
	      Info.SizesArray =
	          CGF.CreateMemTemp(SizeArrayType, ".offload_sizes").getPointer();
	    } else {
	      // We expect all the sizes to be constant, so we collect them to create
	      // a constant array.
	      SmallVector<llvm::Constant *, 16> ConstSizes;
	      for (auto S : Sizes)
	        ConstSizes.push_back(cast<llvm::Constant>(S));

	      auto *SizesArrayInit = llvm::ConstantArray::get(
	          llvm::ArrayType::get(CGM.SizeTy, ConstSizes.size()), ConstSizes);
	      auto *SizesArrayGbl = new llvm::GlobalVariable(
	          CGM.getModule(), SizesArrayInit->getType(),
	          /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
	          SizesArrayInit, ".offload_sizes");
	      SizesArrayGbl->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
	      Info.SizesArray = SizesArrayGbl;
	    }

	    // The map types are always constant so we don't need to generate code to
	    // fill arrays. Instead, we create an array constant.
	    llvm::Constant *MapTypesArrayInit =
	        llvm::ConstantDataArray::get(CGF.Builder.getContext(), MapTypes);
	    auto *MapTypesArrayGbl = new llvm::GlobalVariable(
	        CGM.getModule(), MapTypesArrayInit->getType(),
	        /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage,
	        MapTypesArrayInit, ".offload_maptypes");
	    MapTypesArrayGbl->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
	    Info.MapTypesArray = MapTypesArrayGbl;

	    for (unsigned i = 0; i < Info.NumberOfPtrs; ++i) {
	      // Store the base pointer as a void pointer into slot i of the base
	      // pointers array.
	      llvm::Value *BPVal = *BasePointers[i];
	      if (BPVal->getType()->isPointerTy())
	        BPVal = CGF.Builder.CreateBitCast(BPVal, CGM.VoidPtrTy);
	      else {
	        assert(BPVal->getType()->isIntegerTy() &&
	               "If not a pointer, the value type must be an integer.");
	        BPVal = CGF.Builder.CreateIntToPtr(BPVal, CGM.VoidPtrTy);
	      }
	      llvm::Value *BP = CGF.Builder.CreateConstInBoundsGEP2_32(
	          llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
	          Info.BasePointersArray, 0, i);
	      Address BPAddr(BP, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
	      CGF.Builder.CreateStore(BPVal, BPAddr);

	      // Remember in which slot the address related to a device pointer
	      // declaration lives, if that information was requested.
	      if (Info.requiresDevicePointerInfo())
	        if (auto *DevVD = BasePointers[i].getDevicePtrDecl())
	          Info.CaptureDeviceAddrMap.insert(std::make_pair(DevVD, BPAddr));

	      // Same for the section pointer.
	      llvm::Value *PVal = Pointers[i];
	      if (PVal->getType()->isPointerTy())
	        PVal = CGF.Builder.CreateBitCast(PVal, CGM.VoidPtrTy);
	      else {
	        assert(PVal->getType()->isIntegerTy() &&
	               "If not a pointer, the value type must be an integer.");
	        PVal = CGF.Builder.CreateIntToPtr(PVal, CGM.VoidPtrTy);
	      }
	      llvm::Value *P = CGF.Builder.CreateConstInBoundsGEP2_32(
	          llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
	          Info.PointersArray, 0, i);
	      Address PAddr(P, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
	      CGF.Builder.CreateStore(PVal, PAddr);

	      // The sizes only have to be stored when they are not a constant array.
	      if (hasRuntimeEvaluationCaptureSize) {
	        llvm::Value *S = CGF.Builder.CreateConstInBoundsGEP2_32(
	            llvm::ArrayType::get(CGM.SizeTy, Info.NumberOfPtrs),
	            Info.SizesArray,
	            /*Idx0=*/0,
	            /*Idx1=*/i);
	        Address SAddr(S, Ctx.getTypeAlignInChars(Ctx.getSizeType()));
	        CGF.Builder.CreateStore(
	            CGF.Builder.CreateIntCast(Sizes[i], CGM.SizeTy, /*isSigned=*/true),
	            SAddr);
	      }
	    }
	  }
	}
	/// \brief Emit the arguments to be passed to the runtime library based on the
	/// arrays of pointers, sizes and map types.
	///
	/// If \p Info describes at least one pointer, every output argument is set to
	/// a GEP with indices (0, 0) into the matching array stored in \p Info.
	/// Otherwise every output argument is set to a null pointer constant of the
	/// matching pointer type.
	///
	/// \param CGF Function in which the GEPs are emitted.
	/// \param BasePointersArrayArg [out] Argument for the base pointers array.
	/// \param PointersArrayArg [out] Argument for the pointers array.
	/// \param SizesArrayArg [out] Argument for the sizes array.
	/// \param MapTypesArrayArg [out] Argument for the map types array.
	/// \param Info Arrays (and their element count) to build the arguments from.
	static void emitOffloadingArraysArgument(
	    CodeGenFunction &CGF, llvm::Value *&BasePointersArrayArg,
	    llvm::Value *&PointersArrayArg, llvm::Value *&SizesArrayArg,
	    llvm::Value *&MapTypesArrayArg, CGOpenMPRuntime::TargetDataInfo &Info) {
	  auto &CGM = CGF.CGM;
	  if (Info.NumberOfPtrs) {
	    // Callers may pass the members of Info themselves as output arguments, so
	    // each array is read exactly once, right before its argument is written.
	    BasePointersArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
	        llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
	        Info.BasePointersArray,
	        /*Idx0=*/0, /*Idx1=*/0);
	    PointersArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
	        llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
	        Info.PointersArray,
	        /*Idx0=*/0,
	        /*Idx1=*/0);
	    SizesArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
	        llvm::ArrayType::get(CGM.SizeTy, Info.NumberOfPtrs), Info.SizesArray,
	        /*Idx0=*/0, /*Idx1=*/0);
	    MapTypesArrayArg = CGF.Builder.CreateConstInBoundsGEP2_32(
	        llvm::ArrayType::get(CGM.Int32Ty, Info.NumberOfPtrs),
	        Info.MapTypesArray,
	        /*Idx0=*/0,
	        /*Idx1=*/0);
	  } else {
	    // Nothing is mapped: pass typed null pointers to the runtime.
	    BasePointersArrayArg = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
	    PointersArrayArg = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy);
	    SizesArrayArg = llvm::ConstantPointerNull::get(CGM.SizeTy->getPointerTo());
	    MapTypesArrayArg =
	        llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo());
	  }
	}

	/// Emit the host code that launches the target region of directive \p D.
	///
	/// For every capture of the associated captured statement the map information
	/// (base pointer, pointer, size, map type) is gathered; the first base pointer
	/// of each capture is also recorded as an argument of the host fallback call.
	/// When \p OutlinedFnID is non-null the offloading runtime entry point is
	/// called (guarded by \p IfCond if present) and its result is stored in a
	/// temporary; in every other case -1 is stored. The emitted code then calls
	/// \p OutlinedFn on the host whenever that temporary is non-zero.
	///
	/// \param CGF Function in which the code is emitted.
	/// \param D Target directive being lowered.
	/// \param OutlinedFn Host version of the target region; must not be null.
	/// \param OutlinedFnID Identifier of the target region, or null if offloading
	/// is not supported.
	/// \param IfCond Expression of the 'if' clause, or null.
	/// \param Device Expression of the 'device' clause, or null.
	/// \param CapturedVars Values of the captured variables, one per capture.
	void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
	                                     const OMPExecutableDirective &D,
	                                     llvm::Value *OutlinedFn,
	                                     llvm::Value *OutlinedFnID,
	                                     const Expr *IfCond, const Expr *Device,
	                                     ArrayRef<llvm::Value *> CapturedVars) {
	  if (!CGF.HaveInsertPoint())
	    return;

	  assert(OutlinedFn && "Invalid outlined function!");

	  auto &Ctx = CGF.getContext();

	  // Fill up the arrays with all the captured variables.
	  MappableExprsHandler::MapValuesArrayTy KernelArgs;
	  MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
	  MappableExprsHandler::MapValuesArrayTy Pointers;
	  MappableExprsHandler::MapValuesArrayTy Sizes;
	  MappableExprsHandler::MapFlagsArrayTy MapTypes;

	  MappableExprsHandler::MapBaseValuesArrayTy CurBasePointers;
	  MappableExprsHandler::MapValuesArrayTy CurPointers;
	  MappableExprsHandler::MapValuesArrayTy CurSizes;
	  MappableExprsHandler::MapFlagsArrayTy CurMapTypes;

	  // Get mappable expression information.
	  MappableExprsHandler MEHandler(D, CGF);

	  const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
	  auto RI = CS.getCapturedRecordDecl()->field_begin();
	  auto CV = CapturedVars.begin();
	  for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(),
	                                            CE = CS.capture_end();
	       CI != CE; ++CI, ++RI, ++CV) {
	    StringRef Name;
	    QualType Ty;

	    CurBasePointers.clear();
	    CurPointers.clear();
	    CurSizes.clear();
	    CurMapTypes.clear();

	    // VLA sizes are passed to the outlined region by copy and do not have map
	    // information associated.
	    if (CI->capturesVariableArrayType()) {
	      CurBasePointers.push_back(*CV);
	      CurPointers.push_back(*CV);
	      CurSizes.push_back(CGF.getTypeSize(RI->getType()));
	      // Copy to the device as an argument. No need to retrieve it.
	      CurMapTypes.push_back(MappableExprsHandler::OMP_MAP_PRIVATE_VAL |
	                            MappableExprsHandler::OMP_MAP_FIRST_REF);
	    } else {
	      // If we have any information in the map clause, we use it, otherwise we
	      // just do a default mapping.
	      MEHandler.generateInfoForCapture(CI, *CV, CurBasePointers, CurPointers,
	                                       CurSizes, CurMapTypes);
	      if (CurBasePointers.empty())
	        MEHandler.generateDefaultMapInfo(*CI, **RI, *CV, CurBasePointers,
	                                         CurPointers, CurSizes, CurMapTypes);
	    }
	    // We expect to have at least an element of information for this capture.
	    assert(!CurBasePointers.empty() && "Non-existing map pointer for capture!");
	    assert(CurBasePointers.size() == CurPointers.size() &&
	           CurBasePointers.size() == CurSizes.size() &&
	           CurBasePointers.size() == CurMapTypes.size() &&
	           "Inconsistent map information sizes!");

	    // The kernel args are always the first elements of the base pointers
	    // associated with a capture.
	    KernelArgs.push_back(*CurBasePointers.front());
	    // We need to append the results of this capture to what we already have.
	    BasePointers.append(CurBasePointers.begin(), CurBasePointers.end());
	    Pointers.append(CurPointers.begin(), CurPointers.end());
	    Sizes.append(CurSizes.begin(), CurSizes.end());
	    MapTypes.append(CurMapTypes.begin(), CurMapTypes.end());
	  }

	  // Keep track on whether the host function has to be executed.
	  auto OffloadErrorQType =
	      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true);
	  auto OffloadError = CGF.MakeAddrLValue(
	      CGF.CreateMemTemp(OffloadErrorQType, ".run_host_version"),
	      OffloadErrorQType);
	  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty),
	                        OffloadError);

	  // Fill up the pointer arrays and transfer execution to the device.
	  // Only the values actually used by the body are captured.
	  auto &&ThenGen = [&BasePointers, &Pointers, &Sizes, &MapTypes, Device,
	                    OutlinedFnID, OffloadError,
	                    &D](CodeGenFunction &CGF, PrePostActionTy &) {
	    auto &RT = CGF.CGM.getOpenMPRuntime();
	    // Emit the offloading arrays.
	    TargetDataInfo Info;
	    emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
	    emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
	                                 Info.PointersArray, Info.SizesArray,
	                                 Info.MapTypesArray, Info);

	    // On top of the arrays that were filled up, the target offloading call
	    // takes as arguments the device id as well as the host pointer. The host
	    // pointer is used by the runtime library to identify the current target
	    // region, so it only has to be unique and not necessarily point to
	    // anything. It could be the pointer to the outlined function that
	    // implements the target region, but we aren't using that so that the
	    // compiler doesn't need to keep that, and could therefore inline the host
	    // function if proven worthwhile during optimization.

	    // From this point on, we need to have an ID of the target region defined.
	    assert(OutlinedFnID && "Invalid outlined function ID!");

	    // Emit device ID if any.
	    llvm::Value *DeviceID;
	    if (Device)
	      DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
	                                           CGF.Int32Ty, /*isSigned=*/true);
	    else
	      DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);

	    // Emit the number of elements in the offloading arrays.
	    llvm::Value *PointerNum = CGF.Builder.getInt32(BasePointers.size());

	    // Return value of the runtime offloading call.
	    llvm::Value *Return;

	    auto *NumTeams = emitNumTeamsClauseForTargetDirective(RT, CGF, D);
	    auto *ThreadLimit = emitThreadLimitClauseForTargetDirective(RT, CGF, D);

	    // If we have NumTeams defined this means that we have an enclosed teams
	    // region. Therefore we also expect to have ThreadLimit defined. These two
	    // values should be defined in the presence of a teams directive, regardless
	    // of having any clauses associated. If the user is using teams but no
	    // clauses, these two values will be the default that should be passed to
	    // the runtime library - a 32-bit integer with the value zero.
	    if (NumTeams) {
	      assert(ThreadLimit && "Thread limit expression should be available along "
	                            "with number of teams.");
	      llvm::Value *OffloadingArgs[] = {
	          DeviceID,           OutlinedFnID,
	          PointerNum,         Info.BasePointersArray,
	          Info.PointersArray, Info.SizesArray,
	          Info.MapTypesArray, NumTeams,
	          ThreadLimit};
	      Return = CGF.EmitRuntimeCall(
	          RT.createRuntimeFunction(OMPRTL__tgt_target_teams), OffloadingArgs);
	    } else {
	      llvm::Value *OffloadingArgs[] = {
	          DeviceID,           OutlinedFnID,
	          PointerNum,         Info.BasePointersArray,
	          Info.PointersArray, Info.SizesArray,
	          Info.MapTypesArray};
	      Return = CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target),
	                                   OffloadingArgs);
	    }

	    CGF.EmitStoreOfScalar(Return, OffloadError);
	  };

	  // Notify that the host version must be executed.
	  auto &&ElseGen = [OffloadError](CodeGenFunction &CGF, PrePostActionTy &) {
	    CGF.EmitStoreOfScalar(llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/-1u),
	                          OffloadError);
	  };

	  // If we have a target function ID it means that we need to support
	  // offloading, otherwise, just execute on the host. We need to execute on host
	  // regardless of the conditional in the if clause if, e.g., the user does not
	  // specify target triples.
	  if (OutlinedFnID) {
	    if (IfCond)
	      emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
	    else {
	      RegionCodeGenTy ThenRCG(ThenGen);
	      ThenRCG(CGF);
	    }
	  } else {
	    RegionCodeGenTy ElseRCG(ElseGen);
	    ElseRCG(CGF);
	  }

	  // Check the error code and execute the host version if required.
	  auto OffloadFailedBlock = CGF.createBasicBlock("omp_offload.failed");
	  auto OffloadContBlock = CGF.createBasicBlock("omp_offload.cont");
	  auto OffloadErrorVal = CGF.EmitLoadOfScalar(OffloadError, SourceLocation());
	  auto Failed = CGF.Builder.CreateIsNotNull(OffloadErrorVal);
	  CGF.Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

	  CGF.EmitBlock(OffloadFailedBlock);
	  CGF.Builder.CreateCall(OutlinedFn, KernelArgs);
	  CGF.EmitBranch(OffloadContBlock);

	  CGF.EmitBlock(OffloadContBlock, /*IsFinished=*/true);
	}

	/// Recursively walk statement \p S looking for OpenMP target directives and
	/// emit the outlined function of each one that is registered as an offload
	/// entry for \p ParentName.
	///
	/// \param S Statement to scan; a null statement is ignored.
	/// \param ParentName Name used to look up and emit the target region entries.
	void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S,
	                                                    StringRef ParentName) {
	  if (!S)
	    return;

	  // If we find a OMP target directive, codegen the outline function and
	  // register the result.
	  // FIXME: Add other directives with target when they become supported.
	  bool isTargetDirective = isa<OMPTargetDirective>(S);

	  if (isTargetDirective) {
	    auto *E = cast<OMPExecutableDirective>(S);
	    unsigned DeviceID;
	    unsigned FileID;
	    unsigned Line;
	    getTargetEntryUniqueInfo(CGM.getContext(), E->getLocStart(), DeviceID,
	                             FileID, Line);

	    // Is this a target region that should not be emitted as an entry point? If
	    // so just signal we are done with this target region.
	    if (!OffloadEntriesInfoManager.hasTargetRegionEntryInfo(DeviceID, FileID,
	                                                            ParentName, Line))
	      return;

	    llvm::Function *Fn;
	    llvm::Constant *Addr;
	    std::tie(Fn, Addr) =
	        CodeGenFunction::EmitOMPTargetDirectiveOutlinedFunction(
	            CGM, cast<OMPTargetDirective>(*E), ParentName,
	            /*isOffloadEntry=*/true);
	    assert(Fn && Addr && "Target region emission failed.");
	    // The body of a target directive is not scanned any further.
	    return;
	  }

	  // Any other executable directive: continue inside its captured statement.
	  if (const OMPExecutableDirective *E = dyn_cast<OMPExecutableDirective>(S)) {
	    if (!E->hasAssociatedStmt())
	      return;

	    scanForTargetRegionsFunctions(
	        cast<CapturedStmt>(E->getAssociatedStmt())->getCapturedStmt(),
	        ParentName);
	    return;
	  }

	  // If this is a lambda function, look into its body.
	  if (auto *L = dyn_cast<LambdaExpr>(S))
	    S = L->getBody();

	  // Keep looking for target regions recursively.
	  for (auto *II : S->children())
	    scanForTargetRegionsFunctions(II, ParentName);
	}

	/// Process the function referenced by \p GD when compiling for a device.
	///
	/// \returns false when compiling for the host, so the regular code generation
	/// handles the function; true when compiling for a device, after the body has
	/// been scanned for target regions.
	bool CGOpenMPRuntime::emitTargetFunctions(GlobalDecl GD) {
	  const auto *Function = cast<FunctionDecl>(GD.getDecl());
	  const bool CompilingForDevice = CGM.getLangOpts().OpenMPIsDevice;

	  if (CompilingForDevice) {
	    // On the device only the functions created while scanning for target
	    // regions are emitted; the function itself is reported as fully handled.
	    scanForTargetRegionsFunctions(Function->getBody(), CGM.getMangledName(GD));
	  }

	  // On the host nothing is done here and the normal path takes over.
	  return CompilingForDevice;
	}

	/// Process the global variable referenced by \p GD when compiling for a
	/// device.
	///
	/// \returns false when compiling for the host; true when compiling for a
	/// device, meaning the variable is considered processed (declare target is not
	/// implemented yet, so no global is emitted in target mode).
	bool CGOpenMPRuntime::emitTargetGlobalVariable(GlobalDecl GD) {
	  if (!CGM.getLangOpts().OpenMPIsDevice)
	    return false;

	  QualType VarTy = cast<VarDecl>(GD.getDecl())->getType();
	  auto *Record = VarTy->getBaseElementTypeUnsafe()->getAsCXXRecordDecl();
	  if (!Record)
	    return true;

	  // Constructors and the destructor may contain target regions. The complete
	  // variant is used to produce the kernel name mangling.
	  for (auto *Ctor : Record->ctors())
	    scanForTargetRegionsFunctions(
	        Ctor->getBody(), CGM.getMangledName(GlobalDecl(Ctor, Ctor_Complete)));

	  if (auto *Dtor = Record->getDestructor())
	    scanForTargetRegionsFunctions(
	        Dtor->getBody(), CGM.getMangledName(GlobalDecl(Dtor, Dtor_Complete)));

	  return true;
	}

	/// Dispatch \p GD to the function or the global-variable handler depending on
	/// the kind of declaration it refers to, and forward that handler's result.
	bool CGOpenMPRuntime::emitTargetGlobal(GlobalDecl GD) {
	  return isa<FunctionDecl>(GD.getDecl()) ? emitTargetFunctions(GD)
	                                         : emitTargetGlobalVariable(GD);
	}

	/// Emit the offload entries of the current module and then create the
	/// registration of the offloading binary descriptor.
	///
	/// \returns whatever createOffloadingBinaryDescriptorRegistration() returns.
	llvm::Function *CGOpenMPRuntime::emitRegistrationFunction() {
	  // The entries and their metadata have to exist before the descriptor, the
	  // main entity describing offloading in this compilation unit, is built.
	  createOffloadEntriesAndInfoMetadata();

	  llvm::Function *Registration = createOffloadingBinaryDescriptorRegistration();
	  return Registration;
	}

	/// Emit a call to __kmpc_fork_teams(loc, n, microtask, var1, .., varn) that
	/// runs \p OutlinedFn with the captured variables \p CapturedVars.
	///
	/// Does nothing when \p CGF has no insertion point.
	void CGOpenMPRuntime::emitTeamsCall(CodeGenFunction &CGF,
	                                    const OMPExecutableDirective &D,
	                                    SourceLocation Loc,
	                                    llvm::Value *OutlinedFn,
	                                    ArrayRef<llvm::Value *> CapturedVars) {
	  if (!CGF.HaveInsertPoint())
	    return;

	  auto *RTLoc = emitUpdateLocation(CGF, Loc);
	  CodeGenFunction::RunCleanupsScope Scope(CGF);

	  // Fixed leading arguments first, then one argument per captured variable.
	  llvm::SmallVector<llvm::Value *, 16> CallArgs;
	  CallArgs.push_back(RTLoc);
	  // Number of captured vars.
	  CallArgs.push_back(CGF.Builder.getInt32(CapturedVars.size()));
	  CallArgs.push_back(
	      CGF.Builder.CreateBitCast(OutlinedFn, getKmpc_MicroPointerTy()));
	  CallArgs.append(CapturedVars.begin(), CapturedVars.end());

	  auto ForkTeamsFn = createRuntimeFunction(OMPRTL__kmpc_fork_teams);
	  CGF.EmitRuntimeCall(ForkTeamsFn, CallArgs);
	}

	/// Emit a call to
	/// __kmpc_push_num_teams(&loc, global_tid, num_teams, thread_limit).
	///
	/// \p NumTeams and \p ThreadLimit are emitted as signed casts to a 32-bit
	/// integer; a null expression is replaced by the constant 0. Does nothing
	/// when \p CGF has no insertion point.
	void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF,
	                                         const Expr *NumTeams,
	                                         const Expr *ThreadLimit,
	                                         SourceLocation Loc) {
	  if (!CGF.HaveInsertPoint())
	    return;

	  auto *RTLoc = emitUpdateLocation(CGF, Loc);

	  // Value of an optional clause expression as i32, or 0 when it is absent.
	  auto EmitInt32OrZero = [&CGF](const Expr *Clause) -> llvm::Value * {
	    if (!Clause)
	      return CGF.Builder.getInt32(0);
	    return CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Clause),
	                                     CGF.CGM.Int32Ty, /* isSigned = */ true);
	  };

	  llvm::Value *NumTeamsVal = EmitInt32OrZero(NumTeams);
	  llvm::Value *ThreadLimitVal = EmitInt32OrZero(ThreadLimit);

	  llvm::Value *PushNumTeamsArgs[] = {RTLoc, getThreadID(CGF, Loc), NumTeamsVal,
	                                     ThreadLimitVal};
	  CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_teams),
	                      PushNumTeamsArgs);
	}

	/// Emit the code of a target data region: the call that opens the data
	/// environment, the body produced by \p CodeGen, and the call that closes the
	/// data environment.
	///
	/// Both runtime calls are guarded by \p IfCond when it is present. When
	/// \p Info reports captured device addresses, the body is emitted inside the
	/// opening code (and, with privatization turned off, in the 'else' branch of
	/// the 'if' clause); otherwise it is emitted once between the two calls.
	///
	/// \param CGF Function in which the code is emitted.
	/// \param D Directive whose map clauses describe the data environment.
	/// \param IfCond Expression of the 'if' clause, or null.
	/// \param Device Expression of the 'device' clause, or null.
	/// \param CodeGen Generator of the region body.
	/// \param Info Filled while opening the region and reused when closing it.
	void CGOpenMPRuntime::emitTargetDataCalls(
	    CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *IfCond,
	    const Expr *Device, const RegionCodeGenTy &CodeGen, TargetDataInfo &Info) {
	  if (!CGF.HaveInsertPoint())
	    return;

	  // Action used to replace the default codegen action and turn privatization
	  // off.
	  PrePostActionTy NoPrivAction;

	  // Generate the code for the opening of the data environment. Capture all the
	  // arguments of the runtime call by reference because they are used in the
	  // closing of the region.
	  // CGF is not captured: the lambda receives it as a parameter, and naming a
	  // parameter in the capture list is ill-formed.
	  auto &&BeginThenGen = [&D, Device, &Info, &CodeGen](CodeGenFunction &CGF,
	                                                      PrePostActionTy &) {
	    // Fill up the arrays with all the mapped variables.
	    MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
	    MappableExprsHandler::MapValuesArrayTy Pointers;
	    MappableExprsHandler::MapValuesArrayTy Sizes;
	    MappableExprsHandler::MapFlagsArrayTy MapTypes;

	    // Get map clause information.
	    MappableExprsHandler MCHandler(D, CGF);
	    MCHandler.generateAllInfo(BasePointers, Pointers, Sizes, MapTypes);

	    // Fill up the arrays and create the arguments.
	    emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);

	    llvm::Value *BasePointersArrayArg = nullptr;
	    llvm::Value *PointersArrayArg = nullptr;
	    llvm::Value *SizesArrayArg = nullptr;
	    llvm::Value *MapTypesArrayArg = nullptr;
	    emitOffloadingArraysArgument(CGF, BasePointersArrayArg, PointersArrayArg,
	                                 SizesArrayArg, MapTypesArrayArg, Info);

	    // Emit device ID if any.
	    llvm::Value *DeviceID = nullptr;
	    if (Device)
	      DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
	                                           CGF.Int32Ty, /*isSigned=*/true);
	    else
	      DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);

	    // Emit the number of elements in the offloading arrays.
	    auto *PointerNum = CGF.Builder.getInt32(Info.NumberOfPtrs);

	    llvm::Value *OffloadingArgs[] = {
	        DeviceID,         PointerNum,    BasePointersArrayArg,
	        PointersArrayArg, SizesArrayArg, MapTypesArrayArg};
	    auto &RT = CGF.CGM.getOpenMPRuntime();
	    CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target_data_begin),
	                        OffloadingArgs);

	    // If device pointer privatization is required, emit the body of the region
	    // here. It will have to be duplicated: with and without privatization.
	    if (!Info.CaptureDeviceAddrMap.empty())
	      CodeGen(CGF);
	  };

	  // Generate code for the closing of the data region.
	  auto &&EndThenGen = [Device, &Info](CodeGenFunction &CGF,
	                                      PrePostActionTy &) {
	    assert(Info.isValid() && "Invalid data environment closing arguments.");

	    llvm::Value *BasePointersArrayArg = nullptr;
	    llvm::Value *PointersArrayArg = nullptr;
	    llvm::Value *SizesArrayArg = nullptr;
	    llvm::Value *MapTypesArrayArg = nullptr;
	    emitOffloadingArraysArgument(CGF, BasePointersArrayArg, PointersArrayArg,
	                                 SizesArrayArg, MapTypesArrayArg, Info);

	    // Emit device ID if any.
	    llvm::Value *DeviceID = nullptr;
	    if (Device)
	      DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
	                                           CGF.Int32Ty, /*isSigned=*/true);
	    else
	      DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);

	    // Emit the number of elements in the offloading arrays.
	    auto *PointerNum = CGF.Builder.getInt32(Info.NumberOfPtrs);

	    llvm::Value *OffloadingArgs[] = {
	        DeviceID,         PointerNum,    BasePointersArrayArg,
	        PointersArrayArg, SizesArrayArg, MapTypesArrayArg};
	    auto &RT = CGF.CGM.getOpenMPRuntime();
	    CGF.EmitRuntimeCall(RT.createRuntimeFunction(OMPRTL__tgt_target_data_end),
	                        OffloadingArgs);
	  };

	  // If we need device pointer privatization, we need to emit the body of the
	  // region with no privatization in the 'else' branch of the conditional.
	  // Otherwise, we don't have to do anything.
	  auto &&BeginElseGen = [&Info, &CodeGen, &NoPrivAction](CodeGenFunction &CGF,
	                                                         PrePostActionTy &) {
	    if (!Info.CaptureDeviceAddrMap.empty()) {
	      CodeGen.setAction(NoPrivAction);
	      CodeGen(CGF);
	    }
	  };

	  // We don't have to do anything to close the region if the if clause evaluates
	  // to false.
	  auto &&EndElseGen = [](CodeGenFunction &CGF, PrePostActionTy &) {};

	  if (IfCond) {
	    emitOMPIfClause(CGF, IfCond, BeginThenGen, BeginElseGen);
	  } else {
	    RegionCodeGenTy RCG(BeginThenGen);
	    RCG(CGF);
	  }

	  // If we don't require privatization of device pointers, we emit the body in
	  // between the runtime calls. This avoids duplicating the body code.
	  if (Info.CaptureDeviceAddrMap.empty()) {
	    CodeGen.setAction(NoPrivAction);
	    CodeGen(CGF);
	  }

	  if (IfCond) {
	    emitOMPIfClause(CGF, IfCond, EndThenGen, EndElseGen);
	  } else {
	    RegionCodeGenTy RCG(EndThenGen);
	    RCG(CGF);
	  }
	}

	/// Emit the runtime call of a stand-alone target data directive.
	///
	/// \p D must be a target enter data, target exit data or target update
	/// directive; the runtime entry point is selected from its directive kind.
	/// The call is guarded by \p IfCond when it is present, and nothing is emitted
	/// on the 'else' side.
	///
	/// \param CGF Function in which the code is emitted.
	/// \param D Stand-alone directive being lowered.
	/// \param IfCond Expression of the 'if' clause, or null.
	/// \param Device Expression of the 'device' clause, or null.
	void CGOpenMPRuntime::emitTargetDataStandAloneCall(
	    CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *IfCond,
	    const Expr *Device) {
	  if (!CGF.HaveInsertPoint())
	    return;

	  assert((isa<OMPTargetEnterDataDirective>(D) ||
	          isa<OMPTargetExitDataDirective>(D) ||
	          isa<OMPTargetUpdateDirective>(D)) &&
	         "Expecting either target enter, exit data, or update directives.");

	  // Generate the code for the opening of the data environment.
	  // CGF is not captured: the lambda receives it as a parameter, and naming a
	  // parameter in the capture list is ill-formed.
	  auto &&ThenGen = [&D, Device](CodeGenFunction &CGF, PrePostActionTy &) {
	    // Fill up the arrays with all the mapped variables.
	    MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
	    MappableExprsHandler::MapValuesArrayTy Pointers;
	    MappableExprsHandler::MapValuesArrayTy Sizes;
	    MappableExprsHandler::MapFlagsArrayTy MapTypes;

	    // Get map clause information.
	    MappableExprsHandler MEHandler(D, CGF);
	    MEHandler.generateAllInfo(BasePointers, Pointers, Sizes, MapTypes);

	    // Fill up the arrays and create the arguments.
	    TargetDataInfo Info;
	    emitOffloadingArrays(CGF, BasePointers, Pointers, Sizes, MapTypes, Info);
	    emitOffloadingArraysArgument(CGF, Info.BasePointersArray,
	                                 Info.PointersArray, Info.SizesArray,
	                                 Info.MapTypesArray, Info);

	    // Emit device ID if any.
	    llvm::Value *DeviceID = nullptr;
	    if (Device)
	      DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
	                                           CGF.Int32Ty, /*isSigned=*/true);
	    else
	      DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF);

	    // Emit the number of elements in the offloading arrays.
	    auto *PointerNum = CGF.Builder.getInt32(BasePointers.size());

	    llvm::Value *OffloadingArgs[] = {
	        DeviceID,           PointerNum,      Info.BasePointersArray,
	        Info.PointersArray, Info.SizesArray, Info.MapTypesArray};

	    auto &RT = CGF.CGM.getOpenMPRuntime();
	    // Select the right runtime function call for each expected standalone
	    // directive.
	    OpenMPRTLFunction RTLFn;
	    switch (D.getDirectiveKind()) {
	    default:
	      llvm_unreachable("Unexpected standalone target data directive.");
	      break;
	    case OMPD_target_enter_data:
	      RTLFn = OMPRTL__tgt_target_data_begin;
	      break;
	    case OMPD_target_exit_data:
	      RTLFn = OMPRTL__tgt_target_data_end;
	      break;
	    case OMPD_target_update:
	      RTLFn = OMPRTL__tgt_target_data_update;
	      break;
	    }
	    CGF.EmitRuntimeCall(RT.createRuntimeFunction(RTLFn), OffloadingArgs);
	  };

	  // In the event we get an if clause, we don't have to take any action on the
	  // else side.
	  auto &&ElseGen = [](CodeGenFunction &CGF, PrePostActionTy &) {};

	  if (IfCond) {
	    emitOMPIfClause(CGF, IfCond, ThenGen, ElseGen);
	  } else {
	    RegionCodeGenTy ThenGenRCG(ThenGen);
	    ThenGenRCG(CGF);
	  }
	}

	namespace {
	/// How a parameter of a function with a 'declare simd' directive is
	/// classified.
	enum ParamKindTy {
	  LinearWithVarStride,
	  Linear,
	  Uniform,
	  Vector
	};

	/// Per-parameter attributes of a function with a 'declare simd' directive.
	struct ParamAttrTy {
	  /// Classification of the parameter; parameters are Vector by default.
	  ParamKindTy Kind = Vector;
	  /// Value attached to a Linear or LinearWithVarStride parameter.
	  llvm::APSInt StrideOrArg;
	  /// Alignment recorded for the parameter.
	  llvm::APSInt Alignment;
	};
	} // namespace

	/// Compute the size of the "characteristic data type" (CDT) of \p FD, as
	/// reported by ASTContext::getTypeSize().
	///
	/// \param FD Function carrying the 'declare simd' directive.
	/// \param ParamAttrs Attributes of the parameters; for a C++ method the first
	/// element describes the implicit object parameter.
	/// \returns the size of the CDT, or 0 if the return type of \p FD is null.
	/// NOTE(review): the caller divides a vector register width in bits by this
	/// value, so the size is presumably in bits - confirm.
	static unsigned evaluateCDTSize(const FunctionDecl *FD,
	                                ArrayRef<ParamAttrTy> ParamAttrs) {
	  // Every vector variant of a SIMD-enabled function has a vector length (VLEN).
	  // If OpenMP clause "simdlen" is used, the VLEN is the value of the argument
	  // of that clause. The VLEN value must be power of 2.
	  // In other case the notion of the function's "characteristic data type" (CDT)
	  // is used to compute the vector length.
	  // CDT is defined in the following order:
	  //   a) For non-void function, the CDT is the return type.
	  //   b) If the function has any non-uniform, non-linear parameters, then the
	  //   CDT is the type of the first such parameter.
	  //   c) If the CDT determined by a) or b) above is struct, union, or class
	  //   type which is pass-by-value (except for the type that maps to the
	  //   built-in complex data type), the characteristic data type is int.
	  //   d) If none of the above three cases is applicable, the CDT is int.
	  // The VLEN is then determined based on the CDT and the size of vector
	  // register of that ISA for which current vector version is generated. The
	  // VLEN is computed using the formula below:
	  //   VLEN  = sizeof(vector_register) / sizeof(CDT),
	  // where vector register size specified in section 3.2.1 Registers and the
	  // Stack Frame of original AMD64 ABI document.
	  QualType RetType = FD->getReturnType();
	  if (RetType.isNull())
	    return 0;
	  ASTContext &C = FD->getASTContext();
	  QualType CDT;
	  // RetType is known to be non-null past the early return above.
	  if (!RetType->isVoidType())
	    CDT = RetType;
	  else {
	    unsigned Offset = 0;
	    if (auto *MD = dyn_cast<CXXMethodDecl>(FD)) {
	      // The implicit object parameter occupies slot 0 of ParamAttrs.
	      if (ParamAttrs[Offset].Kind == Vector)
	        CDT = C.getPointerType(C.getRecordType(MD->getParent()));
	      ++Offset;
	    }
	    if (CDT.isNull()) {
	      // Take the type of the first parameter still classified as Vector.
	      for (unsigned I = 0, E = FD->getNumParams(); I < E; ++I) {
	        if (ParamAttrs[I + Offset].Kind == Vector) {
	          CDT = FD->getParamDecl(I)->getType();
	          break;
	        }
	      }
	    }
	  }
	  if (CDT.isNull())
	    CDT = C.IntTy;
	  CDT = CDT->getCanonicalTypeUnqualified();
	  if (CDT->isRecordType() || CDT->isUnionType())
	    CDT = C.IntTy;
	  return C.getTypeSize(CDT);
	}

	static void
	emitX86DeclareSimdFunction(const FunctionDecl FD, llvm::Function Fn,
	const llvm::APSInt &VLENVal,
	ArrayRef<ParamAttrTy> ParamAttrs,
	OMPDeclareSimdDeclAttr::BranchStateTy State) {
	struct ISADataTy {
	char ISA;
	unsigned VecRegSize;
	};
	ISADataTy ISAData[] = {
	{
	'b', 128
	}, // SSE
	{
	'c', 256
	}, // AVX
	{
	'd', 256
	}, // AVX2
	{
	'e', 512
	}, // AVX512
	};
	llvm::SmallVector<char, 2> Masked;
	switch (State) {
	case OMPDeclareSimdDeclAttr::BS_Undefined:
	Masked.push_back('N');
	Masked.push_back('M');
	break;
	case OMPDeclareSimdDeclAttr::BS_Notinbranch:
	Masked.push_back('N');
	break;
	case OMPDeclareSimdDeclAttr::BS_Inbranch:
	Masked.push_back('M');
	break;
	}
	for (auto Mask : Masked) {
	for (auto &Data : ISAData) {
	SmallString<256> Buffer;
	llvm::raw_svector_ostream Out(Buffer);
	Out << "_ZGV" << Data.ISA << Mask;
	if (!VLENVal) {
	Out << llvm::APSInt::getUnsigned(Data.VecRegSize /
	evaluateCDTSize(FD, ParamAttrs));
	} else
	Out << VLENVal;
	for (auto &ParamAttr : ParamAttrs) {
	switch (ParamAttr.Kind){
	case LinearWithVarStride:
	Out << 's' << ParamAttr.StrideOrArg;
	break;
	case Linear:
	Out << 'l';
	if (!!ParamAttr.StrideOrArg)
	Out << ParamAttr.StrideOrArg;
	break;
	case Uniform:
	Out << 'u';
	break;
	case Vector:
	Out << 'v';
	break;
	}
	if (!!ParamAttr.Alignment)
	Out << 'a' << ParamAttr.Alignment;
	}
	Out << '_' << Fn->getName();
	Fn->addFnAttr(Out.str());
	}
	}
	}

	void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
	llvm::Function *Fn) {
	ASTContext &C = CGM.getContext();
	FD = FD->getCanonicalDecl();
	// Map params to their positions in function decl.
	llvm::DenseMap<const Decl *, unsigned> ParamPositions;
	if (isa<CXXMethodDecl>(FD))
	ParamPositions.insert({FD, 0});
	unsigned ParamPos = ParamPositions.size();
	for (auto *P : FD->parameters()) {
	ParamPositions.insert({P->getCanonicalDecl(), ParamPos});
	++ParamPos;
	}
	for (auto *Attr : FD->specific_attrs<OMPDeclareSimdDeclAttr>()) {
	llvm::SmallVector<ParamAttrTy, 8> ParamAttrs(ParamPositions.size());
	// Mark uniform parameters.
	for (auto *E : Attr->uniforms()) {
	E = E->IgnoreParenImpCasts();
	unsigned Pos;
	if (isa<CXXThisExpr>(E))
	Pos = ParamPositions[FD];
	else {
	auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
	->getCanonicalDecl();
	Pos = ParamPositions[PVD];
	}
	ParamAttrs[Pos].Kind = Uniform;
	}
	// Get alignment info.
	auto NI = Attr->alignments_begin();
	for (auto *E : Attr->aligneds()) {
	E = E->IgnoreParenImpCasts();
	unsigned Pos;
	QualType ParmTy;
	if (isa<CXXThisExpr>(E)) {
	Pos = ParamPositions[FD];
	ParmTy = E->getType();
	} else {
	auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
	->getCanonicalDecl();
	Pos = ParamPositions[PVD];
	ParmTy = PVD->getType();
	}
	ParamAttrs[Pos].Alignment =
	(NI) ? (NI)->EvaluateKnownConstInt(C)
	: llvm::APSInt::getUnsigned(
	C.toCharUnitsFromBits(C.getOpenMPDefaultSimdAlign(ParmTy))
	.getQuantity());
	++NI;
	}
	// Mark linear parameters.
	auto SI = Attr->steps_begin();
	auto MI = Attr->modifiers_begin();
	for (auto *E : Attr->linears()) {
	E = E->IgnoreParenImpCasts();
	unsigned Pos;
	if (isa<CXXThisExpr>(E))
	Pos = ParamPositions[FD];
	else {
	auto *PVD = cast<ParmVarDecl>(cast<DeclRefExpr>(E)->getDecl())
	->getCanonicalDecl();
	Pos = ParamPositions[PVD];
	}
	auto &ParamAttr = ParamAttrs[Pos];
	ParamAttr.Kind = Linear;
	if (*SI) {
	if (!(*SI)->EvaluateAsInt(ParamAttr.StrideOrArg, C,
	Expr::SE_AllowSideEffects)) {
	if (auto DRE = cast<DeclRefExpr>((SI)->IgnoreParenImpCasts())) {
	if (auto *StridePVD = cast<ParmVarDecl>(DRE->getDecl())) {
	ParamAttr.Kind = LinearWithVarStride;
	ParamAttr.StrideOrArg = llvm::APSInt::getUnsigned(
	ParamPositions[StridePVD->getCanonicalDecl()]);
	}
	}
	}
	}
	++SI;
	++MI;
	}
	llvm::APSInt VLENVal;
	if (const Expr *VLEN = Attr->getSimdlen())
	VLENVal = VLEN->EvaluateKnownConstInt(C);
	OMPDeclareSimdDeclAttr::BranchStateTy State = Attr->getBranchState();
	if (CGM.getTriple().getArch() == llvm::Triple::x86 \|\|
	CGM.getTriple().getArch() == llvm::Triple::x86_64)
	emitX86DeclareSimdFunction(FD, Fn, VLENVal, ParamAttrs, State);
	}
	}

	namespace {
	/// Cleanup action for doacross support: when the cleanup is emitted it calls
	/// the stored runtime function with the stored arguments.
	class DoacrossCleanupTy final : public EHScopeStack::Cleanup {
	public:
	  /// Number of arguments passed to the runtime function.
	  static const int DoacrossFinArgs = 2;

	private:
	  llvm::Value *RTLFn;
	  llvm::Value *Args[DoacrossFinArgs];

	public:
	  /// \param RTLFn Runtime function to call when the cleanup is emitted.
	  /// \param CallArgs Arguments of the call; exactly DoacrossFinArgs values,
	  /// which are copied into the cleanup.
	  DoacrossCleanupTy(llvm::Value *RTLFn, ArrayRef<llvm::Value *> CallArgs)
	      : RTLFn(RTLFn) {
	    assert(CallArgs.size() == DoacrossFinArgs);
	    std::copy(CallArgs.begin(), CallArgs.end(), std::begin(Args));
	  }
	  /// Emit the runtime call, unless \p CGF has no insertion point.
	  void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
	    if (!CGF.HaveInsertPoint())
	      return;
	    CGF.EmitRuntimeCall(RTLFn, Args);
	  }
	};
	} // namespace

	/// Emit the initialization of doacross support for loop directive \p D.
	///
	/// A zero-initialized temporary of type kmp_dim is created (the record type is
	/// built on first use and cached in KmpDimTy), its upper bound is set to the
	/// number of iterations of \p D and its stride to 1. __kmpc_doacross_init is
	/// then called with one dimension, and a cleanup calling
	/// __kmpc_doacross_fini is pushed. Does nothing when \p CGF has no insertion
	/// point.
	void CGOpenMPRuntime::emitDoacrossInit(CodeGenFunction &CGF,
	                                       const OMPLoopDirective &D) {
	  if (!CGF.HaveInsertPoint())
	    return;

	  ASTContext &C = CGM.getContext();
	  QualType Int64Ty = C.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/true);
	  RecordDecl *RD;
	  if (KmpDimTy.isNull()) {
	    // Build struct kmp_dim {  // loop bounds info casted to kmp_int64
	    //  kmp_int64 lo; // lower
	    //  kmp_int64 up; // upper
	    //  kmp_int64 st; // stride
	    // };
	    RD = C.buildImplicitRecord("kmp_dim");
	    RD->startDefinition();
	    addFieldToRecordDecl(C, RD, Int64Ty);
	    addFieldToRecordDecl(C, RD, Int64Ty);
	    addFieldToRecordDecl(C, RD, Int64Ty);
	    RD->completeDefinition();
	    KmpDimTy = C.getRecordType(RD);
	  } else
	    RD = cast<RecordDecl>(KmpDimTy->getAsTagDecl());

	  Address DimsAddr = CGF.CreateMemTemp(KmpDimTy, "dims");
	  CGF.EmitNullInitialization(DimsAddr, KmpDimTy);
	  // Field indices of kmp_dim; the lower bound keeps its zero initialization.
	  enum { LowerFD = 0, UpperFD, StrideFD };
	  // Fill dims with data.
	  LValue DimsLVal = CGF.MakeAddrLValue(DimsAddr, KmpDimTy);
	  // dims.upper = num_iterations;
	  LValue UpperLVal =
	      CGF.EmitLValueForField(DimsLVal, *std::next(RD->field_begin(), UpperFD));
	  llvm::Value *NumIterVal = CGF.EmitScalarConversion(
	      CGF.EmitScalarExpr(D.getNumIterations()), D.getNumIterations()->getType(),
	      Int64Ty, D.getNumIterations()->getExprLoc());
	  CGF.EmitStoreOfScalar(NumIterVal, UpperLVal);
	  // dims.stride = 1;
	  LValue StrideLVal =
	      CGF.EmitLValueForField(DimsLVal, *std::next(RD->field_begin(), StrideFD));
	  CGF.EmitStoreOfScalar(llvm::ConstantInt::getSigned(CGM.Int64Ty, /*V=*/1),
	                        StrideLVal);

	  // Build call void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid,
	  // kmp_int32 num_dims, struct kmp_dim * dims);
	  llvm::Value *Args[] = {emitUpdateLocation(CGF, D.getLocStart()),
	                         getThreadID(CGF, D.getLocStart()),
	                         llvm::ConstantInt::getSigned(CGM.Int32Ty, 1),
	                         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
	                             DimsAddr.getPointer(), CGM.VoidPtrTy)};

	  llvm::Value *RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_init);
	  CGF.EmitRuntimeCall(RTLFn, Args);
	  // Register the matching finalization call as a normal and EH cleanup.
	  llvm::Value *FiniArgs[DoacrossCleanupTy::DoacrossFinArgs] = {
	      emitUpdateLocation(CGF, D.getLocEnd()), getThreadID(CGF, D.getLocEnd())};
	  llvm::Value *FiniRTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_fini);
	  CGF.EHStack.pushCleanup<DoacrossCleanupTy>(NormalAndEHCleanup, FiniRTLFn,
	                                             llvm::makeArrayRef(FiniArgs));
	}

	/// Emit the runtime call for the doacross 'depend' clause \p C.
	///
	/// The counter value of \p C is converted to a signed 64-bit integer and
	/// stored in a temporary whose address is passed to the runtime:
	/// __kmpc_doacross_post for a 'source' dependency, __kmpc_doacross_wait for a
	/// 'sink' dependency.
	void CGOpenMPRuntime::emitDoacrossOrdered(CodeGenFunction &CGF,
	                                          const OMPDependClause *C) {
	  QualType Int64Ty =
	      CGM.getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/true);
	  const Expr *CounterVal = C->getCounterValue();
	  assert(CounterVal);
	  llvm::Value *CntVal = CGF.EmitScalarConversion(CGF.EmitScalarExpr(CounterVal),
	                                                 CounterVal->getType(), Int64Ty,
	                                                 CounterVal->getExprLoc());
	  Address CntAddr = CGF.CreateMemTemp(Int64Ty, ".cnt.addr");
	  CGF.EmitStoreOfScalar(CntVal, CntAddr, /*Volatile=*/false, Int64Ty);
	  llvm::Value *Args[] = {emitUpdateLocation(CGF, C->getLocStart()),
	                         getThreadID(CGF, C->getLocStart()),
	                         CntAddr.getPointer()};
	  llvm::Value *RTLFn;
	  if (C->getDependencyKind() == OMPC_DEPEND_source)
	    RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_post);
	  else {
	    assert(C->getDependencyKind() == OMPC_DEPEND_sink);
	    RTLFn = createRuntimeFunction(OMPRTL__kmpc_doacross_wait);
	  }
	  CGF.EmitRuntimeCall(RTLFn, Args);
	}

	Index: projects/clang400-import/contrib/llvm/tools/clang/lib/Frontend/InitPreprocessor.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/tools/clang/lib/Frontend/InitPreprocessor.cpp (revision 314268)
	+++ projects/clang400-import/contrib/llvm/tools/clang/lib/Frontend/InitPreprocessor.cpp (revision 314269)
	@@ -1,1100 +1,1102 @@
	//===--- InitPreprocessor.cpp - PP initialization code. ---------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the clang::InitializePreprocessor function.
	//
	//===----------------------------------------------------------------------===//

	#include "clang/Basic/FileManager.h"
	#include "clang/Basic/MacroBuilder.h"
	#include "clang/Basic/SourceManager.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Basic/Version.h"
	#include "clang/Frontend/FrontendDiagnostic.h"
	#include "clang/Frontend/FrontendOptions.h"
	#include "clang/Frontend/Utils.h"
	#include "clang/Lex/HeaderSearch.h"
	#include "clang/Lex/PTHManager.h"
	#include "clang/Lex/Preprocessor.h"
	#include "clang/Lex/PreprocessorOptions.h"
	#include "clang/Serialization/ASTReader.h"
	#include "llvm/ADT/APFloat.h"
	using namespace clang;

	/// Return true if the last non-whitespace character of \p MacroBody is a
	/// backslash. An empty or all-whitespace body yields false.
	static bool MacroBodyEndsInBackslash(StringRef MacroBody) {
	  // Peel characters off the end until something other than whitespace shows up.
	  for (; !MacroBody.empty(); MacroBody = MacroBody.drop_back()) {
	    const char Last = MacroBody.back();
	    if (!isWhitespace(Last))
	      return Last == '\\';
	  }
	  return false;
	}

	// Append a #define line for Macro to the builder. Macro is either of the form
	// "XXX", which is emitted as "#define XXX 1", or "XXX=Y z W", which is emitted
	// as "#define XXX Y z W". Use "XXX=" to get a #define with no value.
	static void DefineBuiltinMacro(MacroBuilder &Builder, StringRef Macro,
	                               DiagnosticsEngine &Diags) {
	  const std::pair<StringRef, StringRef> Parts = Macro.split('=');
	  const StringRef Name = Parts.first;

	  // No '=' in the argument: push "macroname 1".
	  if (Name.size() == Macro.size()) {
	    Builder.defineMacro(Macro);
	    return;
	  }

	  // Per GCC -D semantics, the macro ends at \n if it exists.
	  StringRef Body = Parts.second;
	  const StringRef::size_type NewlinePos = Body.find_first_of("\n\r");
	  if (NewlinePos != StringRef::npos)
	    Diags.Report(diag::warn_fe_macro_contains_embedded_newline) << Name;
	  Body = Body.substr(0, NewlinePos);

	  if (!MacroBodyEndsInBackslash(Body)) {
	    Builder.defineMacro(Name, Body);
	    return;
	  }

	  // A body ending in a backslash gets an extra backslash+newline appended so
	  // that the backslash is not accidentally treated as a line continuation
	  // marker.
	  Builder.defineMacro(Name, Twine(Body) + "\\\n");
	}

	/// AddImplicitInclude - Append an implicit \#include "File" line to the
	/// predefines buffer.
	/// These includes come from -include arguments, so the header search logic
	/// is going to search relative to the current working directory.
	static void AddImplicitInclude(MacroBuilder &Builder, StringRef File) {
	  Builder.append("#include \"" + Twine(File) + "\"");
	}

	/// Append an implicit \#__include_macros "File" line to the predefines
	/// buffer, followed by the marker line that terminates the fetch loop.
	static void AddImplicitIncludeMacros(MacroBuilder &Builder, StringRef File) {
	  Builder.append("#__include_macros \"" + Twine(File) + "\"");
	  // Marker token to stop the __include_macros fetch loop.
	  Builder.append("##"); // ##?
	}

	/// AddImplicitIncludePTH - Add an implicit \#include of the original source
	/// file that was used to generate a PTH cache. If no such file can be
	/// obtained, err_fe_pth_file_has_no_source_header is reported for
	/// \p ImplicitIncludePTH and nothing is appended.
	static void AddImplicitIncludePTH(MacroBuilder &Builder, Preprocessor &PP,
	                                  StringRef ImplicitIncludePTH) {
	  // The manager can be null in the corner case where it couldn't be created.
	  if (PTHManager *Mgr = PP.getPTHManager()) {
	    if (const char *Original = Mgr->getOriginalSourceFile()) {
	      AddImplicitInclude(Builder, Original);
	      return;
	    }
	  }

	  PP.getDiagnostics().Report(diag::err_fe_pth_file_has_no_source_header)
	      << ImplicitIncludePTH;
	}

	/// \brief Add an implicit \#include of the original source file that was used
	/// to generate a PCH file. Nothing is appended when the reader returns an
	/// empty file name.
	static void AddImplicitIncludePCH(MacroBuilder &Builder, Preprocessor &PP,
	                                  const PCHContainerReader &PCHContainerRdr,
	                                  StringRef ImplicitIncludePCH) {
	  const std::string Original = ASTReader::getOriginalSourceFile(
	      ImplicitIncludePCH, PP.getFileManager(), PCHContainerRdr,
	      PP.getDiagnostics());
	  if (!Original.empty())
	    AddImplicitInclude(Builder, Original);
	}

	/// PickFP - Select one of the supplied values according to which
	/// floating-point semantics \p Sem points at. The candidates are tested in
	/// the order IEEE single, IEEE double, x87 double-extended, PPC double-double;
	/// anything else is asserted to be IEEE quad.
	template <typename T>
	static T PickFP(const llvm::fltSemantics *Sem, T IEEESingleVal,
	                T IEEEDoubleVal, T X87DoubleExtendedVal, T PPCDoubleDoubleVal,
	                T IEEEQuadVal) {
	  // Semantics objects are compared by identity (address), not by value.
	  auto IsSem = [Sem](const llvm::fltSemantics &Candidate) {
	    return Sem == &Candidate;
	  };

	  if (IsSem(llvm::APFloat::IEEEsingle()))
	    return IEEESingleVal;
	  if (IsSem(llvm::APFloat::IEEEdouble()))
	    return IEEEDoubleVal;
	  if (IsSem(llvm::APFloat::x87DoubleExtended()))
	    return X87DoubleExtendedVal;
	  if (IsSem(llvm::APFloat::PPCDoubleDouble()))
	    return PPCDoubleDoubleVal;

	  assert(IsSem(llvm::APFloat::IEEEquad()));
	  return IEEEQuadVal;
	}

	/// Define the family of "__<Prefix>_*__" macros that describe a
	/// floating-point format.
	///
	/// \param Builder receives the macro definitions.
	/// \param Prefix  name fragment placed between the leading "__" and "_"
	///                (the visible callers pass "FLT", "DBL" and "LDBL").
	/// \param Sem     semantics used to select each value through PickFP.
	/// \param Ext     literal suffix appended to the floating-point valued
	///                macros (DENORM_MIN, EPSILON, MAX, MIN).
	static void DefineFloatMacros(MacroBuilder &Builder, StringRef Prefix,
	                              const llvm::fltSemantics *Sem, StringRef Ext) {
	  // Each PickFP call lists the values for IEEE single, IEEE double, x87
	  // double-extended, PPC double-double and IEEE quad, in that order.
	  const char *DenormMin =
	      PickFP(Sem, "1.40129846e-45", "4.9406564584124654e-324",
	             "3.64519953188247460253e-4951",
	             "4.94065645841246544176568792868221e-324",
	             "6.47517511943802511092443895822764655e-4966");
	  int Digits = PickFP(Sem, 6, 15, 18, 31, 33);
	  int DecimalDigits = PickFP(Sem, 9, 17, 21, 33, 36);
	  const char *Epsilon =
	      PickFP(Sem, "1.19209290e-7", "2.2204460492503131e-16",
	             "1.08420217248550443401e-19",
	             "4.94065645841246544176568792868221e-324",
	             "1.92592994438723585305597794258492732e-34");
	  int MantissaDigits = PickFP(Sem, 24, 53, 64, 106, 113);
	  int Min10Exp = PickFP(Sem, -37, -307, -4931, -291, -4931);
	  int Max10Exp = PickFP(Sem, 38, 308, 4932, 308, 4932);
	  int MinExp = PickFP(Sem, -125, -1021, -16381, -968, -16381);
	  int MaxExp = PickFP(Sem, 128, 1024, 16384, 1024, 16384);
	  const char *Min =
	      PickFP(Sem, "1.17549435e-38", "2.2250738585072014e-308",
	             "3.36210314311209350626e-4932",
	             "2.00416836000897277799610805135016e-292",
	             "3.36210314311209350626267781732175260e-4932");
	  const char *Max =
	      PickFP(Sem, "3.40282347e+38", "1.7976931348623157e+308",
	             "1.18973149535723176502e+4932",
	             "1.79769313486231580793728971405301e+308",
	             "1.18973149535723176508575932662800702e+4932");

	  // Common "__<Prefix>_" stem shared by every macro name below.
	  SmallString<32> DefPrefix;
	  DefPrefix = "__";
	  DefPrefix += Prefix;
	  DefPrefix += "_";

	  Builder.defineMacro(DefPrefix + "DENORM_MIN__", Twine(DenormMin)+Ext);
	  Builder.defineMacro(DefPrefix + "HAS_DENORM__");
	  Builder.defineMacro(DefPrefix + "DIG__", Twine(Digits));
	  Builder.defineMacro(DefPrefix + "DECIMAL_DIG__", Twine(DecimalDigits));
	  Builder.defineMacro(DefPrefix + "EPSILON__", Twine(Epsilon)+Ext);
	  Builder.defineMacro(DefPrefix + "HAS_INFINITY__");
	  Builder.defineMacro(DefPrefix + "HAS_QUIET_NAN__");
	  Builder.defineMacro(DefPrefix + "MANT_DIG__", Twine(MantissaDigits));

	  Builder.defineMacro(DefPrefix + "MAX_10_EXP__", Twine(Max10Exp));
	  Builder.defineMacro(DefPrefix + "MAX_EXP__", Twine(MaxExp));
	  Builder.defineMacro(DefPrefix + "MAX__", Twine(Max)+Ext);

	  // The minimum exponents are negative, so they are wrapped in parentheses.
	  Builder.defineMacro(DefPrefix + "MIN_10_EXP__","("+Twine(Min10Exp)+")");
	  Builder.defineMacro(DefPrefix + "MIN_EXP__", "("+Twine(MinExp)+")");
	  Builder.defineMacro(DefPrefix + "MIN__", Twine(Min)+Ext);
	}


	/// DefineTypeSize - Emit a macro to the predefines buffer that declares a macro
	/// named MacroName with the max value for a type with width 'TypeWidth' a
	/// signedness of 'isSigned' and with a value suffix of 'ValSuffix' (e.g. LL).
	static void DefineTypeSize(const Twine &MacroName, unsigned TypeWidth,
	StringRef ValSuffix, bool isSigned,
	MacroBuilder &Builder) {
	llvm::APInt MaxVal = isSigned ? llvm::APInt::getSignedMaxValue(TypeWidth)
	: llvm::APInt::getMaxValue(TypeWidth);
	Builder.defineMacro(MacroName, MaxVal.toString(10, isSigned) + ValSuffix);
	}

	/// DefineTypeSize - Overload that asks \p TI for the width, constant suffix
	/// and signedness of \p Ty and forwards them to the width-based overload.
	static void DefineTypeSize(const Twine &MacroName, TargetInfo::IntType Ty,
	                           const TargetInfo &TI, MacroBuilder &Builder) {
	  const auto Width = TI.getTypeWidth(Ty);
	  const auto Suffix = TI.getTypeConstantSuffix(Ty);
	  const auto Signed = TI.isTypeSigned(Ty);
	  DefineTypeSize(MacroName, Width, Suffix, Signed, Builder);
	}

	/// Define the "<Prefix>_FMT<c>__" format macros for the integer type \p Ty.
	///
	/// One macro is emitted per conversion character <c>: 'd' and 'i' when
	/// \p Ty is signed, otherwise 'o', 'u', 'x' and 'X'. Each macro expands to a
	/// quoted string made of the target's format modifier for \p Ty followed by
	/// the conversion character.
	static void DefineFmt(const Twine &Prefix, TargetInfo::IntType Ty,
	                      const TargetInfo &TI, MacroBuilder &Builder) {
	  bool IsSigned = TI.isTypeSigned(Ty);
	  StringRef FmtModifier = TI.getTypeFormatModifier(Ty);
	  // Walk the NUL-terminated list of conversion characters.
	  for (const char *Fmt = IsSigned ? "di" : "ouxX"; *Fmt; ++Fmt) {
	    Builder.defineMacro(Prefix + "_FMT" + Twine(*Fmt) + "__",
	                        Twine("\"") + FmtModifier + Twine(*Fmt) + "\"");
	  }
	}

	/// Define \p MacroName to expand to the type name that TargetInfo reports
	/// for \p Ty.
	static void DefineType(const Twine &MacroName, TargetInfo::IntType Ty,
	                       MacroBuilder &Builder) {
	  const auto TypeName = TargetInfo::getTypeName(Ty);
	  Builder.defineMacro(MacroName, TypeName);
	}

	/// Define \p MacroName to expand to the width that \p TI reports for \p Ty.
	static void DefineTypeWidth(StringRef MacroName, TargetInfo::IntType Ty,
	                            const TargetInfo &TI, MacroBuilder &Builder) {
	  const auto Width = TI.getTypeWidth(Ty);
	  Builder.defineMacro(MacroName, Twine(Width));
	}

	/// Define \p MacroName to expand to \p BitWidth divided by the target's char
	/// width, i.e. the size expressed in chars.
	static void DefineTypeSizeof(StringRef MacroName, unsigned BitWidth,
	                             const TargetInfo &TI, MacroBuilder &Builder) {
	  const auto SizeInChars = BitWidth / TI.getCharWidth();
	  Builder.defineMacro(MacroName, Twine(SizeInChars));
	}

	/// Define the "__INT<N>_TYPE__" / "__UINT<N>_TYPE__" macro for \p Ty, together
	/// with the matching "_FMT<c>__" format macros and the "_C_SUFFIX__" macro,
	/// where <N> is the width that \p TI reports for \p Ty.
	static void DefineExactWidthIntType(TargetInfo::IntType Ty,
	                                    const TargetInfo &TI,
	                                    MacroBuilder &Builder) {
	  const int Bits = TI.getTypeWidth(Ty);
	  const bool Signed = TI.isTypeSigned(Ty);

	  // Use the target specified int64 type, when appropriate, so that [u]int64_t
	  // ends up being defined in terms of the correct type.
	  if (Bits == 64) {
	    if (Signed)
	      Ty = TI.getInt64Type();
	    else
	      Ty = TI.getUInt64Type();
	  }

	  // "__INT<N>" or "__UINT<N>", shared by all macro names emitted here.
	  const std::string Stem = ((Signed ? "__INT" : "__UINT") + Twine(Bits)).str();

	  DefineType(Stem + "_TYPE__", Ty, Builder);
	  DefineFmt(Stem, Ty, TI, Builder);
	  Builder.defineMacro(Stem + "_C_SUFFIX__",
	                      StringRef(TI.getTypeConstantSuffix(Ty)));
	}

	/// Define the "__INT<N>_MAX__" / "__UINT<N>_MAX__" macro for \p Ty, where <N>
	/// is the width that \p TI reports for \p Ty.
	static void DefineExactWidthIntTypeSize(TargetInfo::IntType Ty,
	                                        const TargetInfo &TI,
	                                        MacroBuilder &Builder) {
	  const int Bits = TI.getTypeWidth(Ty);
	  const bool Signed = TI.isTypeSigned(Ty);

	  // Use the target specified int64 type, when appropriate, so that [u]int64_t
	  // ends up being defined in terms of the correct type.
	  if (Bits == 64) {
	    if (Signed)
	      Ty = TI.getInt64Type();
	    else
	      Ty = TI.getUInt64Type();
	  }

	  const std::string Stem = ((Signed ? "__INT" : "__UINT") + Twine(Bits)).str();
	  DefineTypeSize(Stem + "_MAX__", Ty, TI, Builder);
	}

	static void DefineLeastWidthIntType(unsigned TypeWidth, bool IsSigned,
	const TargetInfo &TI,
	MacroBuilder &Builder) {
	TargetInfo::IntType Ty = TI.getLeastIntTypeByWidth(TypeWidth, IsSigned);
	if (Ty == TargetInfo::NoInt)
	return;

	const char *Prefix = IsSigned ? "__INT_LEAST" : "__UINT_LEAST";
	DefineType(Prefix + Twine(TypeWidth) + "_TYPE__", Ty, Builder);
	DefineTypeSize(Prefix + Twine(TypeWidth) + "_MAX__", Ty, TI, Builder);
	DefineFmt(Prefix + Twine(TypeWidth), Ty, TI, Builder);
	}

	static void DefineFastIntType(unsigned TypeWidth, bool IsSigned,
	const TargetInfo &TI, MacroBuilder &Builder) {
	// stdint.h currently defines the fast int types as equivalent to the least
	// types.
	TargetInfo::IntType Ty = TI.getLeastIntTypeByWidth(TypeWidth, IsSigned);
	if (Ty == TargetInfo::NoInt)
	return;

	const char *Prefix = IsSigned ? "__INT_FAST" : "__UINT_FAST";
	DefineType(Prefix + Twine(TypeWidth) + "_TYPE__", Ty, Builder);
	DefineTypeSize(Prefix + Twine(TypeWidth) + "_MAX__", Ty, TI, Builder);

	DefineFmt(Prefix + Twine(TypeWidth), Ty, TI, Builder);
	}


	/// Get the value the ATOMIC_*_LOCK_FREE macro should have for a type with
	/// the specified properties.
	///
	/// \returns "2" when the lock-free condition tested in the body holds, and
	/// "1" otherwise.
	-static const char *getLockFreeValue(unsigned TypeWidth, unsigned InlineWidth) {
	+static const char *getLockFreeValue(unsigned TypeWidth, unsigned TypeAlign,
	+ unsigned InlineWidth) {
	// Fully-aligned, power-of-2 sizes no larger than the inline
	// width will be inlined as lock-free operations.
	- // Note: we do not need to check alignment since _Atomic(T) is always
	- // appropriately-aligned in clang.
	- if ((TypeWidth & (TypeWidth - 1)) == 0 && TypeWidth <= InlineWidth)
	+ if (TypeWidth == TypeAlign && (TypeWidth & (TypeWidth - 1)) == 0 &&
	+ TypeWidth <= InlineWidth)
	return "2"; // "always lock free"
	// We cannot be certain what operations the lib calls might be
	// able to implement as lock-free on future processors.
	return "1"; // "sometimes lock free"
	}

	/// \brief Add definitions required for a smooth interaction between
	/// Objective-C++ automated reference counting and libstdc++ (4.2).
	static void AddObjCXXARCLibstdcxxDefines(const LangOptions &LangOpts,
	MacroBuilder &Builder) {
	Builder.defineMacro("_GLIBCXX_PREDEFINED_OBJC_ARC_IS_SCALAR");

	std::string Result;
	{
	// Provide specializations for the __is_scalar type trait so that
	// lifetime-qualified objects are not considered "scalar" types, which
	// libstdc++ uses as an indicator of the presence of trivial copy, assign,
	// default-construct, and destruct semantics (none of which hold for
	// lifetime-qualified objects in ARC).
	llvm::raw_string_ostream Out(Result);

	Out << "namespace std {\n"
	<< "\n"
	<< "struct __true_type;\n"
	<< "struct __false_type;\n"
	<< "\n";

	Out << "template<typename _Tp> struct __is_scalar;\n"
	<< "\n";

	if (LangOpts.ObjCAutoRefCount) {
	Out << "template<typename _Tp>\n"
	<< "struct __is_scalar<__attribute__((objc_ownership(strong))) _Tp> {\n"
	<< " enum { __value = 0 };\n"
	<< " typedef __false_type __type;\n"
	<< "};\n"
	<< "\n";
	}

	if (LangOpts.ObjCWeak) {
	Out << "template<typename _Tp>\n"
	<< "struct __is_scalar<__attribute__((objc_ownership(weak))) _Tp> {\n"
	<< " enum { __value = 0 };\n"
	<< " typedef __false_type __type;\n"
	<< "};\n"
	<< "\n";
	}

	if (LangOpts.ObjCAutoRefCount) {
	Out << "template<typename _Tp>\n"
	<< "struct __is_scalar<__attribute__((objc_ownership(autoreleasing)))"
	<< " _Tp> {\n"
	<< " enum { __value = 0 };\n"
	<< " typedef __false_type __type;\n"
	<< "};\n"
	<< "\n";
	}

	Out << "}\n";
	}
	Builder.append(Result);
	}

	static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
	const LangOptions &LangOpts,
	const FrontendOptions &FEOpts,
	MacroBuilder &Builder) {
	if (!LangOpts.MSVCCompat && !LangOpts.TraditionalCPP)
	Builder.defineMacro("__STDC__");
	if (LangOpts.Freestanding)
	Builder.defineMacro("__STDC_HOSTED__", "0");
	else
	Builder.defineMacro("__STDC_HOSTED__");

	if (!LangOpts.CPlusPlus) {
	if (LangOpts.C11)
	Builder.defineMacro("__STDC_VERSION__", "201112L");
	else if (LangOpts.C99)
	Builder.defineMacro("__STDC_VERSION__", "199901L");
	else if (!LangOpts.GNUMode && LangOpts.Digraphs)
	Builder.defineMacro("__STDC_VERSION__", "199409L");
	} else {
	// FIXME: Use correct value for C++17.
	if (LangOpts.CPlusPlus1z)
	Builder.defineMacro("__cplusplus", "201406L");
	// C++1y [cpp.predefined]p1:
	// The name __cplusplus is defined to the value 201402L when compiling a
	// C++ translation unit.
	else if (LangOpts.CPlusPlus14)
	Builder.defineMacro("__cplusplus", "201402L");
	// C++11 [cpp.predefined]p1:
	// The name __cplusplus is defined to the value 201103L when compiling a
	// C++ translation unit.
	else if (LangOpts.CPlusPlus11)
	Builder.defineMacro("__cplusplus", "201103L");
	// C++03 [cpp.predefined]p1:
	// The name __cplusplus is defined to the value 199711L when compiling a
	// C++ translation unit.
	else
	Builder.defineMacro("__cplusplus", "199711L");

	// C++1z [cpp.predefined]p1:
	// An integer literal of type std::size_t whose value is the alignment
	// guaranteed by a call to operator new(std::size_t)
	//
	// We provide this in all language modes, since it seems generally useful.
	Builder.defineMacro("__STDCPP_DEFAULT_NEW_ALIGNMENT__",
	Twine(TI.getNewAlign() / TI.getCharWidth()) +
	TI.getTypeConstantSuffix(TI.getSizeType()));
	}

	// In C11 these are environment macros. In C++11 they are only defined
	// as part of <cuchar>. To prevent breakage when mixing C and C++
	// code, define these macros unconditionally. We can define them
	// unconditionally, as Clang always uses UTF-16 and UTF-32 for 16-bit
	// and 32-bit character literals.
	Builder.defineMacro("__STDC_UTF_16__", "1");
	Builder.defineMacro("__STDC_UTF_32__", "1");

	if (LangOpts.ObjC1)
	Builder.defineMacro("__OBJC__");

	// OpenCL v1.0/1.1 s6.9, v1.2/2.0 s6.10: Preprocessor Directives and Macros.
	if (LangOpts.OpenCL) {
	// OpenCL v1.0 and v1.1 do not have a predefined macro to indicate the
	// language standard with which the program is compiled. __OPENCL_VERSION__
	// is for the OpenCL version supported by the OpenCL device, which is not
	// necessarily the language standard with which the program is compiled.
	// A shared OpenCL header file requires a macro to indicate the language
	// standard. As a workaround, __OPENCL_C_VERSION__ is defined for
	// OpenCL v1.0 and v1.1.
	switch (LangOpts.OpenCLVersion) {
	case 100:
	Builder.defineMacro("__OPENCL_C_VERSION__", "100");
	break;
	case 110:
	Builder.defineMacro("__OPENCL_C_VERSION__", "110");
	break;
	case 120:
	Builder.defineMacro("__OPENCL_C_VERSION__", "120");
	break;
	case 200:
	Builder.defineMacro("__OPENCL_C_VERSION__", "200");
	break;
	default:
	llvm_unreachable("Unsupported OpenCL version");
	}
	Builder.defineMacro("CL_VERSION_1_0", "100");
	Builder.defineMacro("CL_VERSION_1_1", "110");
	Builder.defineMacro("CL_VERSION_1_2", "120");
	Builder.defineMacro("CL_VERSION_2_0", "200");

	if (TI.isLittleEndian())
	Builder.defineMacro("__ENDIAN_LITTLE__");

	if (LangOpts.FastRelaxedMath)
	Builder.defineMacro("__FAST_RELAXED_MATH__");
	}
	// Not "standard" per se, but available even with the -undef flag.
	if (LangOpts.AsmPreprocessor)
	Builder.defineMacro("__ASSEMBLER__");
	if (LangOpts.CUDA)
	Builder.defineMacro("__CUDA__");
	}

	/// Initialize the predefined C++ language feature test macros defined in
	/// ISO/IEC JTC1/SC22/WG21 (C++) SD-6: "SG10 Feature Test Recommendations".
	static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts,
	MacroBuilder &Builder) {
	// C++98 features.
	if (LangOpts.RTTI)
	Builder.defineMacro("__cpp_rtti", "199711");
	if (LangOpts.CXXExceptions)
	Builder.defineMacro("__cpp_exceptions", "199711");

	// C++11 features.
	if (LangOpts.CPlusPlus11) {
	Builder.defineMacro("__cpp_unicode_characters", "200704");
	Builder.defineMacro("__cpp_raw_strings", "200710");
	Builder.defineMacro("__cpp_unicode_literals", "200710");
	Builder.defineMacro("__cpp_user_defined_literals", "200809");
	Builder.defineMacro("__cpp_lambdas", "200907");
	Builder.defineMacro("__cpp_constexpr",
	LangOpts.CPlusPlus14 ? "201304" : "200704");
	Builder.defineMacro("__cpp_range_based_for",
	LangOpts.CPlusPlus1z ? "201603" : "200907");
	Builder.defineMacro("__cpp_static_assert",
	LangOpts.CPlusPlus1z ? "201411" : "200410");
	Builder.defineMacro("__cpp_decltype", "200707");
	Builder.defineMacro("__cpp_attributes", "200809");
	Builder.defineMacro("__cpp_rvalue_references", "200610");
	Builder.defineMacro("__cpp_variadic_templates", "200704");
	Builder.defineMacro("__cpp_initializer_lists", "200806");
	Builder.defineMacro("__cpp_delegating_constructors", "200604");
	Builder.defineMacro("__cpp_nsdmi", "200809");
	Builder.defineMacro("__cpp_inheriting_constructors", "201511");
	Builder.defineMacro("__cpp_ref_qualifiers", "200710");
	Builder.defineMacro("__cpp_alias_templates", "200704");
	}

	// C++14 features.
	if (LangOpts.CPlusPlus14) {
	Builder.defineMacro("__cpp_binary_literals", "201304");
	Builder.defineMacro("__cpp_digit_separators", "201309");
	Builder.defineMacro("__cpp_init_captures", "201304");
	Builder.defineMacro("__cpp_generic_lambdas", "201304");
	Builder.defineMacro("__cpp_decltype_auto", "201304");
	Builder.defineMacro("__cpp_return_type_deduction", "201304");
	Builder.defineMacro("__cpp_aggregate_nsdmi", "201304");
	Builder.defineMacro("__cpp_variable_templates", "201304");
	}
	if (LangOpts.SizedDeallocation)
	Builder.defineMacro("__cpp_sized_deallocation", "201309");

	// C++17 features.
	if (LangOpts.CPlusPlus1z) {
	Builder.defineMacro("__cpp_hex_float", "201603");
	Builder.defineMacro("__cpp_inline_variables", "201606");
	Builder.defineMacro("__cpp_noexcept_function_type", "201510");
	Builder.defineMacro("__cpp_capture_star_this", "201603");
	Builder.defineMacro("__cpp_if_constexpr", "201606");
	Builder.defineMacro("__cpp_template_auto", "201606");
	Builder.defineMacro("__cpp_namespace_attributes", "201411");
	Builder.defineMacro("__cpp_enumerator_attributes", "201411");
	Builder.defineMacro("__cpp_nested_namespace_definitions", "201411");
	Builder.defineMacro("__cpp_variadic_using", "201611");
	Builder.defineMacro("__cpp_aggregate_bases", "201603");
	Builder.defineMacro("__cpp_structured_bindings", "201606");
	Builder.defineMacro("__cpp_nontype_template_args", "201411");
	Builder.defineMacro("__cpp_fold_expressions", "201603");
	}
	if (LangOpts.AlignedAllocation)
	Builder.defineMacro("__cpp_aligned_new", "201606");

	// TS features.
	if (LangOpts.ConceptsTS)
	Builder.defineMacro("__cpp_experimental_concepts", "1");
	if (LangOpts.CoroutinesTS)
	Builder.defineMacro("__cpp_coroutines", "1");
	}

	static void InitializePredefinedMacros(const TargetInfo &TI,
	const LangOptions &LangOpts,
	const FrontendOptions &FEOpts,
	MacroBuilder &Builder) {
	// Compiler version introspection macros.
	Builder.defineMacro("__llvm__"); // LLVM Backend
	Builder.defineMacro("__clang__"); // Clang Frontend
	#define TOSTR2(X) #X
	#define TOSTR(X) TOSTR2(X)
	Builder.defineMacro("__clang_major__", TOSTR(CLANG_VERSION_MAJOR));
	Builder.defineMacro("__clang_minor__", TOSTR(CLANG_VERSION_MINOR));
	Builder.defineMacro("__clang_patchlevel__", TOSTR(CLANG_VERSION_PATCHLEVEL));
	#undef TOSTR
	#undef TOSTR2
	Builder.defineMacro("__clang_version__",
	"\"" CLANG_VERSION_STRING " "
	+ getClangFullRepositoryVersion() + "\"");
	if (!LangOpts.MSVCCompat) {
	// Currently claim to be compatible with GCC 4.2.1-5621, but only if we're
	// not compiling for MSVC compatibility
	Builder.defineMacro("__GNUC_MINOR__", "2");
	Builder.defineMacro("__GNUC_PATCHLEVEL__", "1");
	Builder.defineMacro("__GNUC__", "4");
	Builder.defineMacro("__GXX_ABI_VERSION", "1002");
	}

	// Define macros for the C11 / C++11 memory orderings
	Builder.defineMacro("__ATOMIC_RELAXED", "0");
	Builder.defineMacro("__ATOMIC_CONSUME", "1");
	Builder.defineMacro("__ATOMIC_ACQUIRE", "2");
	Builder.defineMacro("__ATOMIC_RELEASE", "3");
	Builder.defineMacro("__ATOMIC_ACQ_REL", "4");
	Builder.defineMacro("__ATOMIC_SEQ_CST", "5");

	// Support for #pragma redefine_extname (Sun compatibility)
	Builder.defineMacro("__PRAGMA_REDEFINE_EXTNAME", "1");

	// As sad as it is, enough software depends on the __VERSION__ for version
	// checks that it is necessary to report 4.2.1 (the base GCC version we claim
	// compatibility with) first.
	Builder.defineMacro("__VERSION__", "\"4.2.1 Compatible " +
	Twine(getClangFullCPPVersion()) + "\"");

	// Initialize language-specific preprocessor defines.

	// Standard conforming mode?
	if (!LangOpts.GNUMode && !LangOpts.MSVCCompat)
	Builder.defineMacro("__STRICT_ANSI__");

	if (!LangOpts.MSVCCompat && LangOpts.CPlusPlus11)
	Builder.defineMacro("__GXX_EXPERIMENTAL_CXX0X__");

	if (LangOpts.ObjC1) {
	if (LangOpts.ObjCRuntime.isNonFragile()) {
	Builder.defineMacro("__OBJC2__");

	if (LangOpts.ObjCExceptions)
	Builder.defineMacro("OBJC_ZEROCOST_EXCEPTIONS");
	}

	Builder.defineMacro("__OBJC_BOOL_IS_BOOL",
	Twine(TI.useSignedCharForObjCBool() ? "0" : "1"));

	if (LangOpts.getGC() != LangOptions::NonGC)
	Builder.defineMacro("__OBJC_GC__");

	if (LangOpts.ObjCRuntime.isNeXTFamily())
	Builder.defineMacro("__NEXT_RUNTIME__");

	if (LangOpts.ObjCRuntime.getKind() == ObjCRuntime::ObjFW) {
	VersionTuple tuple = LangOpts.ObjCRuntime.getVersion();

	unsigned minor = 0;
	if (tuple.getMinor().hasValue())
	minor = tuple.getMinor().getValue();

	unsigned subminor = 0;
	if (tuple.getSubminor().hasValue())
	subminor = tuple.getSubminor().getValue();

	Builder.defineMacro("__OBJFW_RUNTIME_ABI__",
	Twine(tuple.getMajor() * 10000 + minor * 100 +
	subminor));
	}

	Builder.defineMacro("IBOutlet", "__attribute__((iboutlet))");
	Builder.defineMacro("IBOutletCollection(ClassName)",
	"__attribute__((iboutletcollection(ClassName)))");
	Builder.defineMacro("IBAction", "void)__attribute__((ibaction)");
	Builder.defineMacro("IBInspectable", "");
	Builder.defineMacro("IB_DESIGNABLE", "");
	}

	if (LangOpts.CPlusPlus)
	InitializeCPlusPlusFeatureTestMacros(LangOpts, Builder);

	// darwin_constant_cfstrings controls this. This is also dependent
	// on other things like the runtime I believe. This is set even for C code.
	if (!LangOpts.NoConstantCFStrings)
	Builder.defineMacro("__CONSTANT_CFSTRINGS__");

	if (LangOpts.ObjC2)
	Builder.defineMacro("OBJC_NEW_PROPERTIES");

	if (LangOpts.PascalStrings)
	Builder.defineMacro("__PASCAL_STRINGS__");

	if (LangOpts.Blocks) {
	Builder.defineMacro("__block", "__attribute__((__blocks__(byref)))");
	Builder.defineMacro("__BLOCKS__");
	}

	if (!LangOpts.MSVCCompat && LangOpts.Exceptions)
	Builder.defineMacro("__EXCEPTIONS");
	if (!LangOpts.MSVCCompat && LangOpts.RTTI)
	Builder.defineMacro("__GXX_RTTI");
	if (LangOpts.SjLjExceptions)
	Builder.defineMacro("__USING_SJLJ_EXCEPTIONS__");

	if (LangOpts.Deprecated)
	Builder.defineMacro("__DEPRECATED");

	if (!LangOpts.MSVCCompat && LangOpts.CPlusPlus) {
	Builder.defineMacro("__GNUG__", "4");
	Builder.defineMacro("__GXX_WEAK__");
	Builder.defineMacro("__private_extern__", "extern");
	}

	if (LangOpts.MicrosoftExt) {
	if (LangOpts.WChar) {
	// wchar_t supported as a keyword.
	Builder.defineMacro("_WCHAR_T_DEFINED");
	Builder.defineMacro("_NATIVE_WCHAR_T_DEFINED");
	}
	}

	if (LangOpts.Optimize)
	Builder.defineMacro("__OPTIMIZE__");
	if (LangOpts.OptimizeSize)
	Builder.defineMacro("__OPTIMIZE_SIZE__");

	if (LangOpts.FastMath)
	Builder.defineMacro("__FAST_MATH__");

	// Initialize target-specific preprocessor defines.

	// __BYTE_ORDER__ was added in GCC 4.6. It's analogous
	// to the macro __BYTE_ORDER (no trailing underscores)
	// from glibc's <endian.h> header.
	// We don't support the PDP-11 as a target, but include
	// the define so it can still be compared against.
	Builder.defineMacro("__ORDER_LITTLE_ENDIAN__", "1234");
	Builder.defineMacro("__ORDER_BIG_ENDIAN__", "4321");
	Builder.defineMacro("__ORDER_PDP_ENDIAN__", "3412");
	if (TI.isBigEndian()) {
	Builder.defineMacro("__BYTE_ORDER__", "__ORDER_BIG_ENDIAN__");
	Builder.defineMacro("__BIG_ENDIAN__");
	} else {
	Builder.defineMacro("__BYTE_ORDER__", "__ORDER_LITTLE_ENDIAN__");
	Builder.defineMacro("__LITTLE_ENDIAN__");
	}

	if (TI.getPointerWidth(0) == 64 && TI.getLongWidth() == 64
	&& TI.getIntWidth() == 32) {
	Builder.defineMacro("_LP64");
	Builder.defineMacro("__LP64__");
	}

	if (TI.getPointerWidth(0) == 32 && TI.getLongWidth() == 32
	&& TI.getIntWidth() == 32) {
	Builder.defineMacro("_ILP32");
	Builder.defineMacro("__ILP32__");
	}

	// Define type sizing macros based on the target properties.
	assert(TI.getCharWidth() == 8 && "Only support 8-bit char so far");
	Builder.defineMacro("__CHAR_BIT__", Twine(TI.getCharWidth()));

	DefineTypeSize("__SCHAR_MAX__", TargetInfo::SignedChar, TI, Builder);
	DefineTypeSize("__SHRT_MAX__", TargetInfo::SignedShort, TI, Builder);
	DefineTypeSize("__INT_MAX__", TargetInfo::SignedInt, TI, Builder);
	DefineTypeSize("__LONG_MAX__", TargetInfo::SignedLong, TI, Builder);
	DefineTypeSize("__LONG_LONG_MAX__", TargetInfo::SignedLongLong, TI, Builder);
	DefineTypeSize("__WCHAR_MAX__", TI.getWCharType(), TI, Builder);
	DefineTypeSize("__INTMAX_MAX__", TI.getIntMaxType(), TI, Builder);
	DefineTypeSize("__SIZE_MAX__", TI.getSizeType(), TI, Builder);

	DefineTypeSize("__UINTMAX_MAX__", TI.getUIntMaxType(), TI, Builder);
	DefineTypeSize("__PTRDIFF_MAX__", TI.getPtrDiffType(0), TI, Builder);
	DefineTypeSize("__INTPTR_MAX__", TI.getIntPtrType(), TI, Builder);
	DefineTypeSize("__UINTPTR_MAX__", TI.getUIntPtrType(), TI, Builder);

	DefineTypeSizeof("__SIZEOF_DOUBLE__", TI.getDoubleWidth(), TI, Builder);
	DefineTypeSizeof("__SIZEOF_FLOAT__", TI.getFloatWidth(), TI, Builder);
	DefineTypeSizeof("__SIZEOF_INT__", TI.getIntWidth(), TI, Builder);
	DefineTypeSizeof("__SIZEOF_LONG__", TI.getLongWidth(), TI, Builder);
	DefineTypeSizeof("__SIZEOF_LONG_DOUBLE__",TI.getLongDoubleWidth(),TI,Builder);
	DefineTypeSizeof("__SIZEOF_LONG_LONG__", TI.getLongLongWidth(), TI, Builder);
	DefineTypeSizeof("__SIZEOF_POINTER__", TI.getPointerWidth(0), TI, Builder);
	DefineTypeSizeof("__SIZEOF_SHORT__", TI.getShortWidth(), TI, Builder);
	DefineTypeSizeof("__SIZEOF_PTRDIFF_T__",
	TI.getTypeWidth(TI.getPtrDiffType(0)), TI, Builder);
	DefineTypeSizeof("__SIZEOF_SIZE_T__",
	TI.getTypeWidth(TI.getSizeType()), TI, Builder);
	DefineTypeSizeof("__SIZEOF_WCHAR_T__",
	TI.getTypeWidth(TI.getWCharType()), TI, Builder);
	DefineTypeSizeof("__SIZEOF_WINT_T__",
	TI.getTypeWidth(TI.getWIntType()), TI, Builder);
	if (TI.hasInt128Type())
	DefineTypeSizeof("__SIZEOF_INT128__", 128, TI, Builder);

	DefineType("__INTMAX_TYPE__", TI.getIntMaxType(), Builder);
	DefineFmt("__INTMAX", TI.getIntMaxType(), TI, Builder);
	Builder.defineMacro("__INTMAX_C_SUFFIX__",
	TI.getTypeConstantSuffix(TI.getIntMaxType()));
	DefineType("__UINTMAX_TYPE__", TI.getUIntMaxType(), Builder);
	DefineFmt("__UINTMAX", TI.getUIntMaxType(), TI, Builder);
	Builder.defineMacro("__UINTMAX_C_SUFFIX__",
	TI.getTypeConstantSuffix(TI.getUIntMaxType()));
	DefineTypeWidth("__INTMAX_WIDTH__", TI.getIntMaxType(), TI, Builder);
	DefineType("__PTRDIFF_TYPE__", TI.getPtrDiffType(0), Builder);
	DefineFmt("__PTRDIFF", TI.getPtrDiffType(0), TI, Builder);
	DefineTypeWidth("__PTRDIFF_WIDTH__", TI.getPtrDiffType(0), TI, Builder);
	DefineType("__INTPTR_TYPE__", TI.getIntPtrType(), Builder);
	DefineFmt("__INTPTR", TI.getIntPtrType(), TI, Builder);
	DefineTypeWidth("__INTPTR_WIDTH__", TI.getIntPtrType(), TI, Builder);
	DefineType("__SIZE_TYPE__", TI.getSizeType(), Builder);
	DefineFmt("__SIZE", TI.getSizeType(), TI, Builder);
	DefineTypeWidth("__SIZE_WIDTH__", TI.getSizeType(), TI, Builder);
	DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder);
	DefineTypeWidth("__WCHAR_WIDTH__", TI.getWCharType(), TI, Builder);
	DefineType("__WINT_TYPE__", TI.getWIntType(), Builder);
	DefineTypeWidth("__WINT_WIDTH__", TI.getWIntType(), TI, Builder);
	DefineTypeWidth("__SIG_ATOMIC_WIDTH__", TI.getSigAtomicType(), TI, Builder);
	DefineTypeSize("__SIG_ATOMIC_MAX__", TI.getSigAtomicType(), TI, Builder);
	DefineType("__CHAR16_TYPE__", TI.getChar16Type(), Builder);
	DefineType("__CHAR32_TYPE__", TI.getChar32Type(), Builder);

	DefineTypeWidth("__UINTMAX_WIDTH__", TI.getUIntMaxType(), TI, Builder);
	DefineType("__UINTPTR_TYPE__", TI.getUIntPtrType(), Builder);
	DefineFmt("__UINTPTR", TI.getUIntPtrType(), TI, Builder);
	DefineTypeWidth("__UINTPTR_WIDTH__", TI.getUIntPtrType(), TI, Builder);

	DefineFloatMacros(Builder, "FLT", &TI.getFloatFormat(), "F");
	DefineFloatMacros(Builder, "DBL", &TI.getDoubleFormat(), "");
	DefineFloatMacros(Builder, "LDBL", &TI.getLongDoubleFormat(), "L");

	// Define a __POINTER_WIDTH__ macro for stdint.h.
	Builder.defineMacro("__POINTER_WIDTH__",
	Twine((int)TI.getPointerWidth(0)));

	// Define __BIGGEST_ALIGNMENT__ to be compatible with gcc.
	Builder.defineMacro("__BIGGEST_ALIGNMENT__",
	Twine(TI.getSuitableAlign() / TI.getCharWidth()) );

	if (!LangOpts.CharIsSigned)
	Builder.defineMacro("__CHAR_UNSIGNED__");

	if (!TargetInfo::isTypeSigned(TI.getWCharType()))
	Builder.defineMacro("__WCHAR_UNSIGNED__");

	if (!TargetInfo::isTypeSigned(TI.getWIntType()))
	Builder.defineMacro("__WINT_UNSIGNED__");

	// Define exact-width integer types for stdint.h
	DefineExactWidthIntType(TargetInfo::SignedChar, TI, Builder);

	if (TI.getShortWidth() > TI.getCharWidth())
	DefineExactWidthIntType(TargetInfo::SignedShort, TI, Builder);

	if (TI.getIntWidth() > TI.getShortWidth())
	DefineExactWidthIntType(TargetInfo::SignedInt, TI, Builder);

	if (TI.getLongWidth() > TI.getIntWidth())
	DefineExactWidthIntType(TargetInfo::SignedLong, TI, Builder);

	if (TI.getLongLongWidth() > TI.getLongWidth())
	DefineExactWidthIntType(TargetInfo::SignedLongLong, TI, Builder);

	DefineExactWidthIntType(TargetInfo::UnsignedChar, TI, Builder);
	DefineExactWidthIntTypeSize(TargetInfo::UnsignedChar, TI, Builder);
	DefineExactWidthIntTypeSize(TargetInfo::SignedChar, TI, Builder);

	if (TI.getShortWidth() > TI.getCharWidth()) {
	DefineExactWidthIntType(TargetInfo::UnsignedShort, TI, Builder);
	DefineExactWidthIntTypeSize(TargetInfo::UnsignedShort, TI, Builder);
	DefineExactWidthIntTypeSize(TargetInfo::SignedShort, TI, Builder);
	}

	if (TI.getIntWidth() > TI.getShortWidth()) {
	DefineExactWidthIntType(TargetInfo::UnsignedInt, TI, Builder);
	DefineExactWidthIntTypeSize(TargetInfo::UnsignedInt, TI, Builder);
	DefineExactWidthIntTypeSize(TargetInfo::SignedInt, TI, Builder);
	}

	if (TI.getLongWidth() > TI.getIntWidth()) {
	DefineExactWidthIntType(TargetInfo::UnsignedLong, TI, Builder);
	DefineExactWidthIntTypeSize(TargetInfo::UnsignedLong, TI, Builder);
	DefineExactWidthIntTypeSize(TargetInfo::SignedLong, TI, Builder);
	}

	if (TI.getLongLongWidth() > TI.getLongWidth()) {
	DefineExactWidthIntType(TargetInfo::UnsignedLongLong, TI, Builder);
	DefineExactWidthIntTypeSize(TargetInfo::UnsignedLongLong, TI, Builder);
	DefineExactWidthIntTypeSize(TargetInfo::SignedLongLong, TI, Builder);
	}

	DefineLeastWidthIntType(8, true, TI, Builder);
	DefineLeastWidthIntType(8, false, TI, Builder);
	DefineLeastWidthIntType(16, true, TI, Builder);
	DefineLeastWidthIntType(16, false, TI, Builder);
	DefineLeastWidthIntType(32, true, TI, Builder);
	DefineLeastWidthIntType(32, false, TI, Builder);
	DefineLeastWidthIntType(64, true, TI, Builder);
	DefineLeastWidthIntType(64, false, TI, Builder);

	DefineFastIntType(8, true, TI, Builder);
	DefineFastIntType(8, false, TI, Builder);
	DefineFastIntType(16, true, TI, Builder);
	DefineFastIntType(16, false, TI, Builder);
	DefineFastIntType(32, true, TI, Builder);
	DefineFastIntType(32, false, TI, Builder);
	DefineFastIntType(64, true, TI, Builder);
	DefineFastIntType(64, false, TI, Builder);

	char UserLabelPrefix[2] = {TI.getDataLayout().getGlobalPrefix(), 0};
	Builder.defineMacro("__USER_LABEL_PREFIX__", UserLabelPrefix);

	if (LangOpts.FastMath \|\| LangOpts.FiniteMathOnly)
	Builder.defineMacro("__FINITE_MATH_ONLY__", "1");
	else
	Builder.defineMacro("__FINITE_MATH_ONLY__", "0");

	if (!LangOpts.MSVCCompat) {
	if (LangOpts.GNUInline \|\| LangOpts.CPlusPlus)
	Builder.defineMacro("__GNUC_GNU_INLINE__");
	else
	Builder.defineMacro("__GNUC_STDC_INLINE__");

	// The value written by __atomic_test_and_set.
	// FIXME: This is target-dependent.
	Builder.defineMacro("__GCC_ATOMIC_TEST_AND_SET_TRUEVAL", "1");

	// Used by libc++ and libstdc++ to implement ATOMIC_<foo>_LOCK_FREE.
	unsigned InlineWidthBits = TI.getMaxAtomicInlineWidth();
	#define DEFINE_LOCK_FREE_MACRO(TYPE, Type) \
	Builder.defineMacro("__GCC_ATOMIC_" #TYPE "_LOCK_FREE", \
	getLockFreeValue(TI.get##Type##Width(), \
	+ TI.get##Type##Align(), \
	InlineWidthBits));
	DEFINE_LOCK_FREE_MACRO(BOOL, Bool);
	DEFINE_LOCK_FREE_MACRO(CHAR, Char);
	DEFINE_LOCK_FREE_MACRO(CHAR16_T, Char16);
	DEFINE_LOCK_FREE_MACRO(CHAR32_T, Char32);
	DEFINE_LOCK_FREE_MACRO(WCHAR_T, WChar);
	DEFINE_LOCK_FREE_MACRO(SHORT, Short);
	DEFINE_LOCK_FREE_MACRO(INT, Int);
	DEFINE_LOCK_FREE_MACRO(LONG, Long);
	DEFINE_LOCK_FREE_MACRO(LLONG, LongLong);
	Builder.defineMacro("__GCC_ATOMIC_POINTER_LOCK_FREE",
	getLockFreeValue(TI.getPointerWidth(0),
	+ TI.getPointerAlign(0),
	InlineWidthBits));
	#undef DEFINE_LOCK_FREE_MACRO
	}

	if (LangOpts.NoInlineDefine)
	Builder.defineMacro("__NO_INLINE__");

	if (unsigned PICLevel = LangOpts.PICLevel) {
	Builder.defineMacro("__PIC__", Twine(PICLevel));
	Builder.defineMacro("__pic__", Twine(PICLevel));
	if (LangOpts.PIE) {
	Builder.defineMacro("__PIE__", Twine(PICLevel));
	Builder.defineMacro("__pie__", Twine(PICLevel));
	}
	}

	// Macros to control C99 numerics and <float.h>
	Builder.defineMacro("__FLT_EVAL_METHOD__", Twine(TI.getFloatEvalMethod()));
	Builder.defineMacro("__FLT_RADIX__", "2");
	Builder.defineMacro("__DECIMAL_DIG__", "__LDBL_DECIMAL_DIG__");

	if (LangOpts.getStackProtector() == LangOptions::SSPOn)
	Builder.defineMacro("__SSP__");
	else if (LangOpts.getStackProtector() == LangOptions::SSPStrong)
	Builder.defineMacro("__SSP_STRONG__", "2");
	else if (LangOpts.getStackProtector() == LangOptions::SSPReq)
	Builder.defineMacro("__SSP_ALL__", "3");

	// Define a macro that exists only when using the static analyzer.
	if (FEOpts.ProgramAction == frontend::RunAnalysis)
	Builder.defineMacro("__clang_analyzer__");

	if (LangOpts.FastRelaxedMath)
	Builder.defineMacro("__FAST_RELAXED_MATH__");

	if (FEOpts.ProgramAction == frontend::RewriteObjC \|\|
	LangOpts.getGC() != LangOptions::NonGC) {
	Builder.defineMacro("__weak", "__attribute__((objc_gc(weak)))");
	Builder.defineMacro("__strong", "__attribute__((objc_gc(strong)))");
	Builder.defineMacro("__autoreleasing", "");
	Builder.defineMacro("__unsafe_unretained", "");
	} else if (LangOpts.ObjC1) {
	Builder.defineMacro("__weak", "__attribute__((objc_ownership(weak)))");
	Builder.defineMacro("__strong", "__attribute__((objc_ownership(strong)))");
	Builder.defineMacro("__autoreleasing",
	"__attribute__((objc_ownership(autoreleasing)))");
	Builder.defineMacro("__unsafe_unretained",
	"__attribute__((objc_ownership(none)))");
	}

	// On Darwin, there are __double_underscored variants of the type
	// nullability qualifiers.
	if (TI.getTriple().isOSDarwin()) {
	Builder.defineMacro("__nonnull", "_Nonnull");
	Builder.defineMacro("__null_unspecified", "_Null_unspecified");
	Builder.defineMacro("__nullable", "_Nullable");
	}

	// OpenMP definition
	// OpenMP 2.2:
	// In implementations that support a preprocessor, the _OPENMP
	// macro name is defined to have the decimal value yyyymm where
	// yyyy and mm are the year and the month designations of the
	// version of the OpenMP API that the implementation supports.
	switch (LangOpts.OpenMP) {
	case 0:
	break;
	case 40:
	Builder.defineMacro("_OPENMP", "201307");
	break;
	case 45:
	Builder.defineMacro("_OPENMP", "201511");
	break;
	default:
	// Default version is OpenMP 3.1
	Builder.defineMacro("_OPENMP", "201107");
	break;
	}

	// CUDA device path compilation
	if (LangOpts.CUDAIsDevice) {
	// The CUDA_ARCH value is set for the GPU target specified in the NVPTX
	// backend's target defines.
	Builder.defineMacro("__CUDA_ARCH__");
	}

	// We need to communicate this to our CUDA header wrapper, which in turn
	// informs the proper CUDA headers of this choice.
	if (LangOpts.CUDADeviceApproxTranscendentals \|\| LangOpts.FastMath) {
	Builder.defineMacro("__CLANG_CUDA_APPROX_TRANSCENDENTALS__");
	}

	// OpenCL definitions.
	if (LangOpts.OpenCL) {
	#define OPENCLEXT(Ext) \
	if (TI.getSupportedOpenCLOpts().isSupported(#Ext, \
	LangOpts.OpenCLVersion)) \
	Builder.defineMacro(#Ext);
	#include "clang/Basic/OpenCLExtensions.def"
	}

	if (TI.hasInt128Type() && LangOpts.CPlusPlus && LangOpts.GNUMode) {
	// For each extended integer type, g++ defines a macro mapping the
	// index of the type (0 in this case) in some list of extended types
	// to the type.
	Builder.defineMacro("__GLIBCXX_TYPE_INT_N_0", "__int128");
	Builder.defineMacro("__GLIBCXX_BITSIZE_INT_N_0", "128");
	}

	// Get other target #defines.
	TI.getTargetDefines(LangOpts, Builder);
	}

	/// InitializePreprocessor - Assemble the predefines buffer for \p PP so that
	/// the preprocessor and its environment are ready to process a single file.
	///
	/// The buffer is built in this order: the target/language predefined macros
	/// (only when InitOpts.UsePredefines is set), the standard predefined macros,
	/// the command-line macro definitions and undefinitions, the -imacros files,
	/// the implicit PCH/PTH include, and finally the -include files. The finished
	/// text is installed with PP.setPredefines(). This function returns nothing.
	///
	/// \param PP              Preprocessor being configured; also supplies the
	///                        language options, target info and diagnostics.
	/// \param InitOpts        Preprocessor options (macros, includes, preamble).
	/// \param PCHContainerRdr Forwarded to AddImplicitIncludePCH.
	/// \param FEOpts          Frontend options forwarded to the macro
	///                        initialization helpers.
	void clang::InitializePreprocessor(
	    Preprocessor &PP, const PreprocessorOptions &InitOpts,
	    const PCHContainerReader &PCHContainerRdr,
	    const FrontendOptions &FEOpts) {
	  const LangOptions &LangOpts = PP.getLangOpts();
	  std::string PredefineBuffer;
	  PredefineBuffer.reserve(4080);
	  llvm::raw_string_ostream Predefines(PredefineBuffer);
	  MacroBuilder Builder(Predefines);

	  // Emit line markers for various builtin sections of the file. We don't do
	  // this in asm preprocessor mode, because "# 4" is not a line marker directive
	  // in this mode.
	  if (!LangOpts.AsmPreprocessor)
	    Builder.append("# 1 \"<built-in>\" 3");

	  // Install things like __POWERPC__, __GNUC__, etc into the macro table.
	  if (InitOpts.UsePredefines) {
	    // For CUDA with an auxiliary target, the auxiliary target's macros are
	    // emitted first, followed by those of the primary target.
	    if (LangOpts.CUDA && PP.getAuxTargetInfo())
	      InitializePredefinedMacros(*PP.getAuxTargetInfo(), LangOpts, FEOpts,
	                                 Builder);

	    InitializePredefinedMacros(PP.getTargetInfo(), LangOpts, FEOpts, Builder);

	    // Install definitions to make Objective-C++ ARC work well with various
	    // C++ Standard Library implementations.
	    if (LangOpts.ObjC1 && LangOpts.CPlusPlus &&
	        (LangOpts.ObjCAutoRefCount || LangOpts.ObjCWeak)) {
	      switch (InitOpts.ObjCXXARCStandardLibrary) {
	      case ARCXX_nolib:
	      case ARCXX_libcxx:
	        break;

	      case ARCXX_libstdcxx:
	        AddObjCXXARCLibstdcxxDefines(LangOpts, Builder);
	        break;
	      }
	    }
	  }

	  // Even with predefines off, some macros are still predefined.
	  // These should all be defined in the preprocessor according to the
	  // current language configuration.
	  InitializeStandardPredefinedMacros(PP.getTargetInfo(), LangOpts, FEOpts,
	                                     Builder);

	  // Add on the predefines from the driver. Wrap in a #line directive to report
	  // that they come from the command line.
	  if (!LangOpts.AsmPreprocessor)
	    Builder.append("# 1 \"<command line>\" 1");

	  // Process #define's and #undef's in the order they are given.
	  // Each entry is (macro text, isUndef).
	  for (const auto &Macro : InitOpts.Macros) {
	    if (Macro.second) // isUndef
	      Builder.undefineMacro(Macro.first);
	    else
	      DefineBuiltinMacro(Builder, Macro.first, PP.getDiagnostics());
	  }

	  // Exit the command line and go back to <built-in> (2 is LC_LEAVE).
	  if (!LangOpts.AsmPreprocessor)
	    Builder.append("# 1 \"<built-in>\" 2");

	  // If -imacros are specified, include them now. These are processed before
	  // any -include directives.
	  for (const auto &MacroInclude : InitOpts.MacroIncludes)
	    AddImplicitIncludeMacros(Builder, MacroInclude);

	  // Process -include-pch/-include-pth directives.
	  if (!InitOpts.ImplicitPCHInclude.empty())
	    AddImplicitIncludePCH(Builder, PP, PCHContainerRdr,
	                          InitOpts.ImplicitPCHInclude);
	  if (!InitOpts.ImplicitPTHInclude.empty())
	    AddImplicitIncludePTH(Builder, PP, InitOpts.ImplicitPTHInclude);

	  // Process -include directives.
	  for (const std::string &Path : InitOpts.Includes)
	    AddImplicitInclude(Builder, Path);

	  // Instruct the preprocessor to skip the preamble.
	  PP.setSkipMainFilePreamble(InitOpts.PrecompiledPreambleBytes.first,
	                             InitOpts.PrecompiledPreambleBytes.second);

	  // Copy PredefinedBuffer into the Preprocessor.
	  PP.setPredefines(Predefines.str());
	}
	Index: projects/clang400-import/contrib/llvm/tools/clang/lib/StaticAnalyzer/Checkers/VirtualCallChecker.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/tools/clang/lib/StaticAnalyzer/Checkers/VirtualCallChecker.cpp (revision 314268)
	+++ projects/clang400-import/contrib/llvm/tools/clang/lib/StaticAnalyzer/Checkers/VirtualCallChecker.cpp (revision 314269)
	@@ -1,291 +1,292 @@
	//=======- VirtualCallChecker.cpp --------------------------------- C++ --==//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines a checker that checks virtual function calls during
	// construction or destruction of C++ objects.
	//
	//===----------------------------------------------------------------------===//

	#include "ClangSACheckers.h"
	#include "clang/AST/DeclCXX.h"
	#include "clang/AST/StmtVisitor.h"
	#include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h"
	#include "clang/StaticAnalyzer/Core/Checker.h"
	#include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/Support/SaveAndRestore.h"
	#include "llvm/Support/raw_ostream.h"

	using namespace clang;
	using namespace ento;

	namespace {

	/// AST visitor that walks the body of a constructor or destructor looking
	/// for virtual member calls, optionally following direct callees into their
	/// bodies through a depth-first worklist of CallExprs.
	class WalkAST : public StmtVisitor<WalkAST> {
	  const CheckerBase *Checker;
	  BugReporter &BR;
	  AnalysisDeclContext *AC;

	  /// The root constructor or destructor whose callees are being analyzed.
	  const CXXMethodDecl *RootMethod = nullptr;

	  /// Whether the checker should walk into bodies of called functions.
	  /// Controlled by the "Interprocedural" analyzer-config option.
	  bool IsInterprocedural = false;

	  /// Whether the checker should only warn for calls to pure virtual functions
	  /// (which is undefined behavior) or for all virtual functions (which may
	  /// result in unexpected behavior).
	  bool ReportPureOnly = false;

	  using WorkListUnit = const CallExpr *;
	  using DFSWorkList = SmallVector<WorkListUnit, 20>;

	  /// A vector representing the worklist which has a chain of CallExprs.
	  DFSWorkList WList;

	  /// Visitation state of a callee. NotVisited must remain the first
	  /// enumerator: Enqueue() looks entries up with operator[], and relies on a
	  /// freshly created entry comparing equal to NotVisited.
	  enum Kind {
	    NotVisited,
	    /// A CallExpr to this FunctionDecl is in the worklist, but the body has
	    /// not yet been visited.
	    PreVisited,
	    /// A CallExpr to this FunctionDecl is in the worklist, and the body has
	    /// been visited.
	    PostVisited
	  };

	  /// A DenseMap that records visited states of FunctionDecls.
	  llvm::DenseMap<const FunctionDecl *, Kind> VisitedFunctions;

	  /// The CallExpr whose body is currently being visited. This is used for
	  /// generating bug reports. This is null while visiting the body of a
	  /// constructor or destructor.
	  const CallExpr *visitingCallExpr = nullptr;

	public:
	  /// \param checker           Checker that owns the emitted reports.
	  /// \param br                Reporter used to emit diagnostics.
	  /// \param ac                Analysis context of the declaration under check.
	  /// \param rootMethod        Constructor or destructor the walk starts from.
	  /// \param isInterprocedural Whether to follow calls into callee bodies.
	  /// \param reportPureOnly    Whether to report only pure virtual calls.
	  WalkAST(const CheckerBase *checker, BugReporter &br, AnalysisDeclContext *ac,
	          const CXXMethodDecl *rootMethod, bool isInterprocedural,
	          bool reportPureOnly)
	      : Checker(checker), BR(br), AC(ac), RootMethod(rootMethod),
	        IsInterprocedural(isInterprocedural), ReportPureOnly(reportPureOnly) {
	    // Walking should always start from either a constructor or a destructor.
	    assert(isa<CXXConstructorDecl>(rootMethod) ||
	           isa<CXXDestructorDecl>(rootMethod));
	  }

	  /// Returns true while the worklist still holds CallExprs.
	  bool hasWork() const { return !WList.empty(); }

	  /// This method adds a CallExpr to the worklist and marks the callee as
	  /// being PreVisited. Calls without a direct callee, without a body, or
	  /// whose callee has already been seen are ignored.
	  void Enqueue(WorkListUnit WLUnit) {
	    const FunctionDecl *FD = WLUnit->getDirectCallee();
	    if (!FD || !FD->getBody())
	      return;
	    Kind &K = VisitedFunctions[FD];
	    if (K != NotVisited)
	      return;
	    K = PreVisited;
	    WList.push_back(WLUnit);
	  }

	  /// This method returns an item from the worklist without removing it.
	  WorkListUnit Dequeue() {
	    assert(!WList.empty());
	    return WList.back();
	  }

	  /// Drains the worklist. The entry on top is visited once (PreVisited ->
	  /// PostVisited) and is only popped when it is seen again in the
	  /// PostVisited state, i.e. after everything enqueued while visiting its
	  /// body has been handled.
	  void Execute() {
	    while (hasWork()) {
	      WorkListUnit WLUnit = Dequeue();
	      const FunctionDecl *FD = WLUnit->getDirectCallee();
	      assert(FD && FD->getBody());

	      if (VisitedFunctions[FD] == PreVisited) {
	        // If the callee is PreVisited, walk its body.
	        SaveAndRestore<const CallExpr *> SaveCall(visitingCallExpr, WLUnit);
	        Visit(FD->getBody());

	        // Mark the function as being PostVisited to indicate we have
	        // scanned the body. The map is looked up again on purpose: Visit()
	        // can reach Enqueue(), which may add entries to VisitedFunctions.
	        VisitedFunctions[FD] = PostVisited;
	        continue;
	      }

	      // Otherwise, the callee is PostVisited.
	      // Remove it from the worklist.
	      assert(VisitedFunctions[FD] == PostVisited);
	      WList.pop_back();
	    }
	  }

	  // Stmt visitor methods.
	  void VisitCallExpr(CallExpr *CE);
	  void VisitCXXMemberCallExpr(CallExpr *CE);
	  void VisitStmt(Stmt *S) { VisitChildren(S); }
	  void VisitChildren(Stmt *S);

	  /// Emits the diagnostic for a virtual call \p CE; \p isPure tells whether
	  /// the callee is pure virtual.
	  void ReportVirtualCall(const CallExpr *CE, bool isPure);
	};
	} // end anonymous namespace

	//===----------------------------------------------------------------------===//
	// AST walking.
	//===----------------------------------------------------------------------===//

	/// Visit every non-null child statement of \p S.
	void WalkAST::VisitChildren(Stmt *S) {
	  for (Stmt *Sub : S->children()) {
	    // Null children are skipped rather than dispatched.
	    if (!Sub)
	      continue;
	    Visit(Sub);
	  }
	}

	/// Visit the children of a call expression and, when interprocedural
	/// walking is enabled, queue the call so its callee body gets walked.
	void WalkAST::VisitCallExpr(CallExpr *CE) {
	  VisitChildren(CE);

	  if (!IsInterprocedural)
	    return;

	  Enqueue(CE);
	}

	/// Check a C++ member call for a virtual call made on 'this'.
	///
	/// Children are visited first. When the callee is a MemberExpr whose base,
	/// after stripping implicit casts, is not a CXXThisExpr, the call is skipped
	/// entirely: nothing is reported and nothing is enqueued. Otherwise a report
	/// is issued if the direct callee is a virtual method that is not treated as
	/// non-virtual here (qualified member access, or a FinalAttr on the method,
	/// on its parent class, or on the base's best dynamic class type). Finally
	/// the call is enqueued when interprocedural walking is enabled.
	void WalkAST::VisitCXXMemberCallExpr(CallExpr *CE) {
	VisitChildren(CE);
	// Set when the call is known not to dispatch virtually.
	bool callIsNonVirtual = false;

	// Several situations to elide for checking.
	if (MemberExpr *CME = dyn_cast<MemberExpr>(CE->getCallee())) {
	// If the member access is fully qualified (i.e., X::F), then treat
	// this as a non-virtual call and do not warn.
	if (CME->getQualifier())
	callIsNonVirtual = true;

	if (Expr *base = CME->getBase()->IgnoreImpCasts()) {
	// Elide analyzing the call entirely if the base pointer is not 'this'.
	if (!isa<CXXThisExpr>(base))
	return;

	// If the most derived class is marked final, we know that no subclass
	// can override this member.
	if (base->getBestDynamicClassType()->hasAttr<FinalAttr>())
	callIsNonVirtual = true;
	}
	}

	// Get the callee.
	- const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(CE->getDirectCallee());
	+ const CXXMethodDecl *MD =
	+ dyn_cast_or_null<CXXMethodDecl>(CE->getDirectCallee());
	// MD is null when there is no direct callee or it is not a CXXMethodDecl;
	// the condition below guards against that before dereferencing.
	if (MD && MD->isVirtual() && !callIsNonVirtual && !MD->hasAttr<FinalAttr>() &&
	!MD->getParent()->hasAttr<FinalAttr>())
	ReportVirtualCall(CE, MD->isPure());

	// Queue the call so Execute() can walk into the callee's body.
	if (IsInterprocedural)
	Enqueue(CE);
	}

	void WalkAST::ReportVirtualCall(const CallExpr *CE, bool isPure) {
	if (ReportPureOnly && !isPure)
	return;

	SmallString<100> buf;
	llvm::raw_svector_ostream os(buf);

	// FIXME: The interprocedural diagnostic experience here is not good.
	// Ultimately this checker should be re-written to be path sensitive.
	// For now, only diagnose intraprocedurally, by default.
	if (IsInterprocedural) {
	os << "Call Path : ";
	// Name of current visiting CallExpr.
	os << *CE->getDirectCallee();

	// Name of the CallExpr whose body is current being walked.
	if (visitingCallExpr)
	os << " <-- " << *visitingCallExpr->getDirectCallee();
	// Names of FunctionDecls in worklist with state PostVisited.
	for (SmallVectorImpl<const CallExpr *>::iterator I = WList.end(),
	E = WList.begin(); I != E; --I) {
	const FunctionDecl FD = ((I-1))->getDirectCallee();
	assert(FD);
	if (VisitedFunctions[FD] == PostVisited)
	os << " <-- " << *FD;
	}

	os << "\n";
	}

	PathDiagnosticLocation CELoc =
	PathDiagnosticLocation::createBegin(CE, BR.getSourceManager(), AC);
	SourceRange R = CE->getCallee()->getSourceRange();

	os << "Call to ";
	if (isPure)
	os << "pure ";

	os << "virtual function during ";

	if (isa<CXXConstructorDecl>(RootMethod))
	os << "construction ";
	else
	os << "destruction ";

	if (isPure)
	os << "has undefined behavior";
	else
	os << "will not dispatch to derived class";

	BR.EmitBasicReport(AC->getDecl(), Checker,
	"Call to virtual function during construction or "
	"destruction",
	"C++ Object Lifecycle", os.str(), CELoc, R);
	}

	//===----------------------------------------------------------------------===//
	// VirtualCallChecker
	//===----------------------------------------------------------------------===//

	namespace {
	/// AST-level checker that runs WalkAST over the constructors (other than
	/// copy/move constructors) and the destructor of every CXXRecordDecl it is
	/// handed.
	class VirtualCallChecker : public Checker<check::ASTDecl<CXXRecordDecl> > {
	public:
	  /// Passed to WalkAST as its isInterprocedural flag.
	  DefaultBool isInterprocedural;
	  /// Passed to WalkAST as its reportPureOnly flag.
	  DefaultBool isPureOnly;

	  /// Walk each eligible constructor body and the destructor body of \p RD.
	  void checkASTDecl(const CXXRecordDecl *RD, AnalysisManager& mgr,
	                    BugReporter &BR) const {
	    AnalysisDeclContext *ADC = mgr.getAnalysisDeclContext(RD);

	    // Runs a fresh walker over Root's body; methods without a body are
	    // skipped.
	    auto walkFrom = [&](const CXXMethodDecl *Root) {
	      Stmt *Body = Root->getBody();
	      if (!Body)
	        return;
	      WalkAST walker(this, BR, ADC, Root, isInterprocedural, isPureOnly);
	      walker.Visit(Body);
	      walker.Execute();
	    };

	    // Check the constructors.
	    for (const auto *Ctor : RD->ctors()) {
	      if (Ctor->isCopyOrMoveConstructor())
	        continue;
	      walkFrom(Ctor);
	    }

	    // Check the destructor.
	    if (CXXDestructorDecl *Dtor = RD->getDestructor())
	      walkFrom(Dtor);
	  }
	};
	}

	/// Register VirtualCallChecker with \p mgr and initialise its two flags from
	/// the "Interprocedural" and "PureOnly" boolean options, both defaulting to
	/// false.
	void ento::registerVirtualCallChecker(CheckerManager &mgr) {
	  VirtualCallChecker *checker = mgr.registerChecker<VirtualCallChecker>();
	  auto &Options = mgr.getAnalyzerOptions();

	  checker->isInterprocedural =
	      Options.getBooleanOption("Interprocedural", false, checker);
	  checker->isPureOnly = Options.getBooleanOption("PureOnly", false, checker);
	}
	Index: projects/clang400-import/contrib/llvm/tools/clang
	===================================================================
	--- projects/clang400-import/contrib/llvm/tools/clang (revision 314268)
	+++ projects/clang400-import/contrib/llvm/tools/clang (revision 314269)

	Property changes on: projects/clang400-import/contrib/llvm/tools/clang
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/clang/dist:r314176-314267
	Index: projects/clang400-import/contrib/llvm/tools/lld
	===================================================================
	--- projects/clang400-import/contrib/llvm/tools/lld (revision 314268)
	+++ projects/clang400-import/contrib/llvm/tools/lld (revision 314269)

	Property changes on: projects/clang400-import/contrib/llvm/tools/lld
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/lld/dist:r314176-314268
	Index: projects/clang400-import/contrib/llvm/tools/lldb
	===================================================================
	--- projects/clang400-import/contrib/llvm/tools/lldb (revision 314268)
	+++ projects/clang400-import/contrib/llvm/tools/lldb (revision 314269)

	Property changes on: projects/clang400-import/contrib/llvm/tools/lldb
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/lldb/dist:r314176-314268
	Index: projects/clang400-import/contrib/llvm
	===================================================================
	--- projects/clang400-import/contrib/llvm (revision 314268)
	+++ projects/clang400-import/contrib/llvm (revision 314269)

	Property changes on: projects/clang400-import/contrib/llvm
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/llvm/dist:r314176-314267
	Index: projects/clang400-import/lib/clang/include/clang/Basic/Version.inc
	===================================================================
	--- projects/clang400-import/lib/clang/include/clang/Basic/Version.inc (revision 314268)
	+++ projects/clang400-import/lib/clang/include/clang/Basic/Version.inc (revision 314269)
	@@ -1,11 +1,11 @@
	/* $FreeBSD$ */

	#define CLANG_VERSION 4.0.0
	#define CLANG_VERSION_STRING "4.0.0"
	#define CLANG_VERSION_MAJOR 4
	#define CLANG_VERSION_MINOR 0
	#define CLANG_VERSION_PATCHLEVEL 0

	#define CLANG_VENDOR "FreeBSD "

	-#define SVN_REVISION "296002"
	+#define SVN_REVISION "296202"
	Index: projects/clang400-import/lib/clang/include/lld/Config/Version.inc
	===================================================================
	--- projects/clang400-import/lib/clang/include/lld/Config/Version.inc (revision 314268)
	+++ projects/clang400-import/lib/clang/include/lld/Config/Version.inc (revision 314269)
	@@ -1,8 +1,8 @@
	// $FreeBSD$

	#define LLD_VERSION 4.0.0
	#define LLD_VERSION_STRING "4.0.0"
	#define LLD_VERSION_MAJOR 4
	#define LLD_VERSION_MINOR 0
	-#define LLD_REVISION_STRING "296002"
	+#define LLD_REVISION_STRING "296202"
	#define LLD_REPOSITORY_STRING "FreeBSD"

File Metadata

Mime Type: text/x-c++
Expires: Fri, Nov 7, 7:01 AM (2 d)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 24918614
Default Alt Text: (706 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions